From e5a0e16f0ff384fd551f0d26226776516cf9d9e7 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Tue, 27 Jan 2026 18:55:03 +0100
Subject: [PATCH 1/4] docs: add evaluator playground migration planning
 workspace

---
 .../migrate-evaluator-playground/README.md    |  40 ++
 .../migrate-evaluator-playground/context.md   |  57 +++
 .../current-system.md                         | 230 +++++++++
 .../migration-options.md                      | 125 +++++
 .../new-endpoints.md                          | 428 +++++++++++++++++
 .../migrate-evaluator-playground/plan.md      | 450 ++++++++++++++++++
 .../migrate-evaluator-playground/research.md  | 211 ++++++++
 .../risk-analysis.md                          | 287 +++++++++++
 .../migrate-evaluator-playground/status.md    | 134 ++++++
 9 files changed, 1962 insertions(+)
 create mode 100644 docs/design/migrate-evaluator-playground/README.md
 create mode 100644 docs/design/migrate-evaluator-playground/context.md
 create mode 100644 docs/design/migrate-evaluator-playground/current-system.md
 create mode 100644 docs/design/migrate-evaluator-playground/migration-options.md
 create mode 100644 docs/design/migrate-evaluator-playground/new-endpoints.md
 create mode 100644 docs/design/migrate-evaluator-playground/plan.md
 create mode 100644 docs/design/migrate-evaluator-playground/research.md
 create mode 100644 docs/design/migrate-evaluator-playground/risk-analysis.md
 create mode 100644 docs/design/migrate-evaluator-playground/status.md

diff --git a/docs/design/migrate-evaluator-playground/README.md b/docs/design/migrate-evaluator-playground/README.md
new file mode 100644
index 0000000000..4197c667d1
--- /dev/null
+++ b/docs/design/migrate-evaluator-playground/README.md
@@ -0,0 +1,40 @@
+# Migrate Evaluator Playground to New Evaluator Endpoints
+
+## Overview
+
+This planning workspace documents the migration of the Evaluator Playground frontend to use the new workflow-based evaluator endpoints. The backend team has migrated evaluators from the old `EvaluatorConfig` model to the new `SimpleEvaluator` (workflow-based) model, and has kept the legacy endpoints as backward-compatible wrappers. This migration will update the frontend to use the new endpoints directly.
+
+## Context
+
+- **PR #3527**: Backend migration that introduces new evaluator endpoints while keeping legacy endpoints for backward compatibility
+- **Goal**: Migrate the Evaluator Playground frontend to use new endpoints, improving consistency with the new workflow-based architecture
+
+## Documents
+
+| File | Description |
+|------|-------------|
+| [context.md](./context.md) | Background, motivation, problem statement, goals, and non-goals |
+| [current-system.md](./current-system.md) | Detailed map of current Evaluator Playground implementation |
+| [new-endpoints.md](./new-endpoints.md) | New evaluator endpoint shapes and differences from legacy |
+| [research.md](./research.md) | Deep dive into evaluator execution architecture and URI-based handlers |
+| [migration-options.md](./migration-options.md) | Migration plan options: direct vs transitional approaches |
+| [risk-analysis.md](./risk-analysis.md) | Coupling points and risk areas for the migration |
+| [plan.md](./plan.md) | Migration execution plan with phases and milestones |
+| [status.md](./status.md) | Living document for progress updates and decisions |
+
+## Key Files Affected
+
+### Frontend - Core Components
+- `web/oss/src/components/Evaluators/` - Evaluators registry
+- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/` - Playground UI
+- `web/oss/src/services/evaluators/index.ts` - API service layer
+- `web/oss/src/services/evaluations/api_ee/index.ts` - Evaluator run execution
+
+### Frontend - State Management
+- `web/oss/src/state/evaluators/atoms.ts` - Evaluator query atoms
+- `web/oss/src/lib/atoms/evaluation.ts` - Legacy evaluation atoms
+
+### Backend Reference (PR #3527)
+- `api/oss/src/routers/evaluators_router.py` - Legacy endpoints (kept for backward compatibility)
+- `api/oss/src/apis/fastapi/evaluators/router.py` - New `SimpleEvaluators` router
+- `api/oss/src/core/evaluators/dtos.py` - New data transfer objects
diff --git a/docs/design/migrate-evaluator-playground/context.md b/docs/design/migrate-evaluator-playground/context.md
new file mode 100644
index 0000000000..4f2fdae9be
--- /dev/null
+++ b/docs/design/migrate-evaluator-playground/context.md
@@ -0,0 +1,57 @@
+# Context: Migrate Evaluator Playground
+
+## Background
+
+The Agenta platform has undergone a significant architectural change where **evaluators are now workflows**. This means evaluators follow the same git-like versioning model as other workflows:
+- **Artifact** (Evaluator) → **Variant** → **Revision**
+
+Previously, evaluators were stored in a flat `EvaluatorConfigDB` table with simple key-value settings. The new model stores evaluators as `WorkflowArtifactDBE`, `WorkflowVariantDBE`, and `WorkflowRevisionDBE` records with richer metadata and versioning.
+
+## Motivation
+
+1. **Unified Architecture**: Evaluators, testsets, and apps now share the same git-like workflow model
+2. **Better Versioning**: Evaluators can have multiple variants and revision history
+3. **Richer Metadata**: New model supports URIs, schemas, scripts, and configuration in a structured way
+4. **Future Extensibility**: Custom evaluators will be first-class citizens with the same capabilities as built-in ones
+
+## Problem Statement
+
+The Evaluator Playground frontend currently uses legacy endpoints:
+- `GET /evaluators/` - List evaluator templates
+- `GET/POST/PUT/DELETE /evaluators/configs/` - CRUD for evaluator configurations
+- `POST /evaluators/{key}/run/` - Run evaluator in playground
+
+The backend (PR #3527) has:
+1. Migrated all evaluator configs to the new workflow-based model via DB migrations
+2. Created new `SimpleEvaluators` endpoints at `/preview/simple/evaluators/`
+3. Kept legacy endpoints as thin wrappers that convert new model back to legacy format
+
+**The frontend needs to migrate to use the new endpoints directly.**
+
+## Goals
+
+1. **Replace legacy evaluator config CRUD** with new `SimpleEvaluator` endpoints
+2. **Update data models** in frontend to match new `SimpleEvaluator` shape
+3. **Maintain backward compatibility** during transition (feature flag or gradual rollout)
+4. **Keep the evaluator run endpoint** (`/evaluators/{key}/run/`) - this remains unchanged
+5. **Preserve UX** - no user-facing changes to the Evaluator Playground functionality
+
+## Non-Goals
+
+1. **Not migrating the evaluator run endpoint** - The `/evaluators/{key}/run/` endpoint is still used and works the same way
+2. **Not changing the Evaluator Playground UI** - Only the data layer changes
+3. **Not migrating evaluation batch runs** - Those use evaluator revision IDs, which are handled by the backend migration
+4. **Not introducing new evaluator features** - This is a pure data-layer migration
+
+## Success Criteria
+
+1. Evaluator Playground can create, edit, delete evaluators using new endpoints
+2. All existing evaluator configurations continue to work
+3. No regression in evaluator testing functionality
+4. Clean removal of legacy endpoint usage in frontend
+
+## Constraints
+
+1. Must not break existing evaluator configurations
+2. Must coordinate with backend team on endpoint availability
+3. Should be deployable incrementally (not big-bang)
diff --git a/docs/design/migrate-evaluator-playground/current-system.md b/docs/design/migrate-evaluator-playground/current-system.md
new file mode 100644
index 0000000000..7797d76ec4
--- /dev/null
+++ b/docs/design/migrate-evaluator-playground/current-system.md
@@ -0,0 +1,230 @@
+# Current System: Evaluator Playground
+
+## Overview
+
+The Evaluator Playground allows users to:
+1. **Browse** evaluator templates (built-in evaluators)
+2. **Create/Configure** evaluator configurations with custom settings
+3. **Test** evaluators by running them against app variants and test cases
+4. **Manage** (edit, clone, delete) existing evaluator configurations
+
+## File Structure
+
+### Entry Points (Pages)
+
+| Path | Purpose |
+|------|---------|
+| `/web/oss/src/pages/w/[workspace_id]/p/[project_id]/evaluators/index.tsx` | Evaluators list page |
+| `/web/oss/src/pages/w/[workspace_id]/p/[project_id]/evaluators/configure/[evaluator_id].tsx` | Configure evaluator page |
+
+### Core Components
+
+#### Evaluators Registry (`/web/oss/src/components/Evaluators/`)
+
+| File | Purpose |
+|------|---------|
+| `index.tsx` | Main registry with table, search, tabs (automatic/human) |
+| `hooks/useEvaluatorsRegistryData.ts` | Fetches and transforms evaluator data |
+| `assets/getColumns.tsx` | Table column definitions |
+| `components/SelectEvaluatorModal/` | Modal to select evaluator template for new config |
+| `components/ConfigureEvaluator/index.tsx` | Page wrapper that loads data and initializes atoms |
+| `components/DeleteEvaluatorsModal/` | Delete confirmation modal |
+
+#### ConfigureEvaluator (Main UI) 
+
+Location: `/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/`
+
+| File | Purpose |
+|------|---------|
+| `index.tsx` | Configuration form + test panel layout |
+| `DebugSection.tsx` | Test evaluator panel (run variant, run evaluator) |
+| `DynamicFormField.tsx` | Renders settings fields based on evaluator template |
+| `AdvancedSettings.tsx` | Collapsible advanced parameters |
+| `state/atoms.ts` | Jotai atoms for playground state |
+| `variantUtils.ts` | Utility for building variants from revisions |
+
+### State Management
+
+#### Playground Atoms (`state/atoms.ts`)
+
+```typescript
+// Session state
+playgroundSessionAtom          // { evaluator, existingConfigId, mode }
+playgroundEvaluatorAtom        // Current evaluator template (derived)
+playgroundIsEditModeAtom       // Is editing existing config? (derived)
+playgroundIsCloneModeAtom      // Is cloning config? (derived)
+playgroundEditValuesAtom       // Current config values being edited
+
+// Form state
+playgroundFormRefAtom          // Ant Design Form instance
+
+// Test section state
+playgroundSelectedVariantAtom  // Selected variant for testing
+playgroundSelectedTestsetIdAtom // Selected testset ID
+playgroundSelectedRevisionIdAtom // Selected revision ID
+playgroundSelectedTestcaseAtom // Testcase data
+playgroundTraceTreeAtom        // Trace output from running variant
+
+// Persisted state (localStorage)
+playgroundLastAppIdAtom        // Last used app ID
+playgroundLastVariantIdAtom    // Last used variant ID
+
+// Action atoms
+initPlaygroundAtom             // Initialize playground state
+resetPlaygroundAtom            // Reset all state
+commitPlaygroundAtom           // Update state after save
+cloneCurrentConfigAtom         // Switch to clone mode
+```
+
+#### Global Evaluator Atoms (`/web/oss/src/state/evaluators/atoms.ts`)
+
+```typescript
+evaluatorConfigsQueryAtomFamily // Query for evaluator configs
+evaluatorsQueryAtomFamily       // Query for evaluator templates
+nonArchivedEvaluatorsAtom       // Derived: non-archived evaluators
+evaluatorByKeyAtomFamily        // Find evaluator by key
+```
+
+### API Service Layer
+
+#### Evaluators Service (`/web/oss/src/services/evaluators/index.ts`)
+
+```typescript
+// Evaluator Templates (legacy)
+fetchAllEvaluators()           // GET /evaluators
+
+// Evaluator Configs (legacy)
+fetchAllEvaluatorConfigs()     // GET /evaluators/configs
+createEvaluatorConfig()        // POST /evaluators/configs
+updateEvaluatorConfig()        // PUT /evaluators/configs/{id}
+deleteEvaluatorConfig()        // DELETE /evaluators/configs/{id}
+
+// Custom/Human Evaluators (new)
+createEvaluator()              // POST /preview/simple/evaluators/
+updateEvaluator()              // PUT /preview/simple/evaluators/{id}
+fetchEvaluatorById()           // GET /preview/simple/evaluators/{id}
+deleteHumanEvaluator()         // POST /preview/simple/evaluators/{id}/archive
+```
+
+#### Evaluator Run Service (`/web/oss/src/services/evaluations/api_ee/index.ts`)
+
+```typescript
+createEvaluatorDataMapping()   // POST /evaluators/map
+createEvaluatorRunExecution()  // POST /evaluators/{key}/run
+```
+
+## Data Flow
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                           USER ACTIONS                                       │
+│  - Browse evaluators list                                                   │
+│  - Create new evaluator config                                              │
+│  - Edit existing evaluator config                                           │
+│  - Test evaluator with variant + testcase                                   │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│  ENTRY POINTS                                                                │
+│  /evaluators → EvaluatorsRegistry                                           │
+│       ├─ Uses useEvaluatorsRegistryData() hook                              │
+│       │     ├─ Calls fetchAllEvaluators() → GET /evaluators                 │
+│       │     └─ Calls fetchAllEvaluatorConfigs() → GET /evaluators/configs   │
+│       │                                                                      │
+│       ├─ "Create new" → SelectEvaluatorModal → /evaluators/configure/new    │
+│       └─ Click row → /evaluators/configure/{id}                             │
+│                                                                              │
+│  /evaluators/configure/{id} → ConfigureEvaluatorPage                        │
+│       ├─ Loads evaluator template & existing config                         │
+│       ├─ Initializes playgroundSessionAtom                                  │
+│       └─ Renders ConfigureEvaluator component                               │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│  ConfigureEvaluator                                                          │
+│  ┌─────────────────────────────┐  ┌─────────────────────────────┐           │
+│  │  LEFT: Configuration Form   │  │  RIGHT: DebugSection        │           │
+│  │  - Name input               │  │  - Testcase selector        │           │
+│  │  - DynamicFormField[]       │  │  - Variant selector         │           │
+│  │  - AdvancedSettings         │  │  - Run variant button       │           │
+│  │  - Commit/Reset buttons     │  │  - Run evaluator button     │           │
+│  └─────────────────────────────┘  └─────────────────────────────┘           │
+│                                                                              │
+│  Commit Actions:                                                             │
+│  - Create: POST /evaluators/configs → createEvaluatorConfig()               │
+│  - Update: PUT /evaluators/configs/{id} → updateEvaluatorConfig()           │
+│                                                                              │
+│  Test Actions:                                                               │
+│  - Run Variant: callVariant() → POST to variant URL                         │
+│  - Run Evaluator: createEvaluatorRunExecution()                             │
+│                   → POST /evaluators/{key}/run                              │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+## Current API Endpoints Used
+
+### Legacy Endpoints (to be migrated)
+
+| Endpoint | Method | Frontend Function | Purpose |
+|----------|--------|-------------------|---------|
+| `/evaluators/` | GET | `fetchAllEvaluators()` | List evaluator templates |
+| `/evaluators/configs/` | GET | `fetchAllEvaluatorConfigs()` | List evaluator configs |
+| `/evaluators/configs/` | POST | `createEvaluatorConfig()` | Create new config |
+| `/evaluators/configs/{id}/` | PUT | `updateEvaluatorConfig()` | Update existing config |
+| `/evaluators/configs/{id}/` | DELETE | `deleteEvaluatorConfig()` | Delete config |
+
+### Endpoints That Remain Unchanged
+
+| Endpoint | Method | Frontend Function | Purpose |
+|----------|--------|-------------------|---------|
+| `/evaluators/map/` | POST | `createEvaluatorDataMapping()` | Map trace data for RAG evaluators |
+| `/evaluators/{key}/run/` | POST | `createEvaluatorRunExecution()` | Run evaluator (test) |
+
+### Already Using New Endpoints (for custom evaluators)
+
+| Endpoint | Method | Frontend Function | Purpose |
+|----------|--------|-------------------|---------|
+| `/preview/simple/evaluators/` | POST | `createEvaluator()` | Create custom evaluator |
+| `/preview/simple/evaluators/{id}` | PUT | `updateEvaluator()` | Update custom evaluator |
+| `/preview/simple/evaluators/{id}` | GET | `fetchEvaluatorById()` | Fetch evaluator by ID |
+| `/preview/simple/evaluators/{id}/archive` | POST | `deleteHumanEvaluator()` | Archive human evaluator |
+
+## Data Types
+
+### Current EvaluatorConfig (Legacy)
+
+```typescript
+interface EvaluatorConfig {
+    id: string
+    evaluator_key: string
+    name: string
+    settings_values: Record<string, any>
+    created_at: string
+    updated_at: string
+    color?: string
+    tags?: string[]
+    // Frontend additions
+    icon_url?: string | StaticImageData
+}
+```
+
+### Current Evaluator Template (Legacy)
+
+```typescript
+interface Evaluator {
+    name: string
+    key: string
+    settings_presets?: SettingsPreset[]
+    settings_template: Record<string, EvaluationSettingsTemplate>
+    icon_url?: string | StaticImageData
+    color?: string
+    direct_use?: boolean
+    description: string
+    oss?: boolean
+    requires_llm_api_keys?: boolean
+    tags: string[]
+    archived?: boolean
+}
+```
diff --git a/docs/design/migrate-evaluator-playground/migration-options.md b/docs/design/migrate-evaluator-playground/migration-options.md
new file mode 100644
index 0000000000..6ea44db0de
--- /dev/null
+++ b/docs/design/migrate-evaluator-playground/migration-options.md
@@ -0,0 +1,125 @@
+# Migration Options (Plan A vs Plan B)
+
+## Goal
+
+Full migration of the Evaluator Playground to the new workflow-based evaluator APIs, including:
+- CRUD on evaluator configs via `/preview/simple/evaluators/*` (or the richer `/preview/evaluators/*` family)
+- Running evaluators via native workflow invocation (`/preview/workflows/invoke`) instead of the legacy `/evaluators/{key}/run`
+
+This doc lists two concrete migration strategies.
+
+---
+
+## Plan A (Transitional): Keep Internal Shapes Stable
+
+This is the earlier approach: keep the UI/state assuming the legacy `EvaluatorConfig` shape and translate at the API boundary.
+
+### Why it exists
+
+- Minimizes touching UI/atoms/forms
+- Lets you swap endpoints quickly with limited regression surface
+- Good when backend is still stabilizing schemas
+
+### Trade-offs
+
+- Adds an extra abstraction layer (adapters)
+- Can delay paying down legacy assumptions (`settings_values`, `evaluator_key`, etc.)
+
+---
+
+## Plan B (Preferred): Direct Migration (No Adapters)
+
+This changes the frontend domain model to match the backend reality:
+- “Evaluator config” becomes `SimpleEvaluator` (workflow artifact w/ latest evaluator revision data attached)
+- Execution uses workflow invocation (`/preview/workflows/invoke`) using evaluator `data.uri`
+
+### Why it’s better long-term
+
+- Eliminates translation debt
+- Aligns with “evaluators are workflows” concept end-to-end
+- Unlocks revision-aware runs and custom evaluator URIs
+
+### Initial Scope (not exhaustive)
+
+#### 1) Data model and type changes
+
+- Introduce TS types for `SimpleEvaluator*` and `WorkflowService*` (request/response)
+- Replace usages of `EvaluatorConfig` in the evaluator playground path with `SimpleEvaluator`
+
+Key places:
+- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts`
+- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx`
+- `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts`
+
+#### 2) CRUD endpoint swap (configs)
+
+Replace:
+- `GET/POST/PUT/DELETE /evaluators/configs/*`
+
+With:
+- `POST /preview/simple/evaluators/query`
+- `POST /preview/simple/evaluators/`
+- `PUT /preview/simple/evaluators/{id}`
+- `POST /preview/simple/evaluators/{id}/archive`
+
+Key files:
+- `web/oss/src/services/evaluators/index.ts`
+- `web/oss/src/state/evaluators/atoms.ts`
+
+Notes:
+- `evaluator_key` is now derived from `simpleEvaluator.data.uri` (or carried separately in UI state)
+- Settings are now `simpleEvaluator.data.parameters`
+
+#### 3) Run endpoint swap (native invoke)
+
+Replace:
+- `POST /evaluators/{evaluator_key}/run`
+
+With:
+- `POST /preview/workflows/invoke`
+
+What needs changing in the playground:
+- `DebugSection.tsx` currently uses `createEvaluatorRunExecution(evaluatorKey, {inputs, settings})`
+- New call should construct `WorkflowServiceRequest`:
+  - `interface.uri` (or `configuration`+`interface`) derived from evaluator `data` / built-in key
+  - `data.inputs` (merged testcase + prediction)
+  - `data.outputs` (prediction/output)
+  - `data.parameters` (settings)
+
+Key files:
+- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx`
+- plus a new service client, e.g. `web/oss/src/services/workflows/invoke.ts`
+
+#### 4) Registry/list UI adjustments
+
+The evaluators registry table expects legacy `evaluator_key` and `settings_values`. Under Plan B:
+- The list source becomes `SimpleEvaluator[]`
+- Table columns need to read from `data.uri` and `data.parameters`
+
+Key files:
+- `web/oss/src/components/Evaluators/index.tsx`
+- `web/oss/src/components/Evaluators/assets/getColumns.tsx`
+- `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts`
+
+#### 5) Permissions and error handling
+
+Native invoke uses `RUN_WORKFLOWS` permission (backend check). Expect:
+- Different 403 behavior for some users
+- Different error shape: workflow service returns `status.code/message` in response
+
+UI needs:
+- Map the workflow error status to `message.error` and to the output editor
+
+---
+
+## Practical Recommendation
+
+If the objective is “duplicate all endpoints and fully migrate”, Plan B is the right destination.
+
+To reduce risk while still avoiding adapters, a pragmatic sequencing is:
+
+1) Migrate CRUD to SimpleEvaluator endpoints (Plan B)
+2) Keep legacy run for 1-2 PRs while CRUD stabilizes
+3) Migrate run to `/preview/workflows/invoke` (Plan B completion)
+
+This keeps changes reviewable without introducing a permanent adapter layer.
diff --git a/docs/design/migrate-evaluator-playground/new-endpoints.md b/docs/design/migrate-evaluator-playground/new-endpoints.md
new file mode 100644
index 0000000000..05231c4813
--- /dev/null
+++ b/docs/design/migrate-evaluator-playground/new-endpoints.md
@@ -0,0 +1,428 @@
+# New Evaluator Endpoints
+
+## Overview
+
+The new evaluator system treats evaluators as **workflows** with git-like versioning. The `SimpleEvaluator` API provides a simplified interface that abstracts the underlying workflow structure.
+
+## Key Architectural Change
+
+**Evaluators are now workflows identified by URIs.**
+
+URI Format: `agenta:builtin:{evaluator_key}:v0`
+
+Example: `agenta:builtin:auto_exact_match:v0`
+
+The SDK has a `HANDLER_REGISTRY` that maps URIs to actual handler functions. This enables:
+- Native workflow invocation via URI
+- Custom evaluators with user-defined URIs (`user:custom:my_evaluator:latest`)
+- Version management of evaluator implementations
+
+## Evaluator Execution Paths
+
+### Option 1: Legacy Run Endpoint (Maintained for Backward Compatibility)
+
+```
+POST /evaluators/{evaluator_key}/run/
+```
+
+**Request:**
+```typescript
+interface EvaluatorInputInterface {
+    inputs: Record<string, any>    // prediction, ground_truth, etc.
+    settings: Record<string, any>  // evaluator configuration
+    credentials?: Record<string, any>
+}
+```
+
+**Response:**
+```typescript
+interface EvaluatorOutputInterface {
+    outputs: Record<string, any>  // score, success, etc.
+}
+```
+
+**Internal Implementation (PR #3527):**
+```python
+async def _run_evaluator(evaluator_key: str, evaluator_input):
+    # Build URI from evaluator_key
+    uri = f"agenta:builtin:{evaluator_key}:v0"
+    
+    # Retrieve handler from SDK registry
+    handler = retrieve_handler(uri)
+    
+    # Invoke handler directly
+    result = handler(inputs=inputs, outputs=outputs, parameters=settings)
+    
+    return {"outputs": result}
+```
+
+### Option 2: Native Workflow Invoke Endpoint
+
+```
+POST /preview/workflows/invoke
+```
+
+**Request:**
+```typescript
+interface WorkflowServiceRequest {
+    data: {
+        inputs: Record<string, any>
+        outputs?: any
+        parameters?: Record<string, any>  // settings
+    }
+    revision?: {
+        data?: {
+            uri: string  // e.g., "agenta:builtin:auto_exact_match:v0"
+            parameters?: Record<string, any>
+        }
+    }
+}
+```
+
+**Response:**
+```typescript
+interface WorkflowServiceBatchResponse {
+    data: {
+        outputs: Record<string, any>
+    }
+    status?: {
+        code: number
+        message: string
+    }
+}
+```
+
+### Option 3: Evaluator Revision-Based Invoke
+
+For a fully "native" approach:
+
+1. **Fetch the evaluator revision:**
+   ```
+   POST /preview/evaluators/revisions/retrieve
+   ```
+   
+2. **Get the URI from revision data:**
+   ```typescript
+   const uri = evaluatorRevision.data.uri  // "agenta:builtin:auto_exact_match:v0"
+   ```
+
+3. **Invoke via workflow service:**
+   ```
+   POST /preview/workflows/invoke
+   ```
+
+## Comparison: Which Approach to Use?
+
+| Aspect | Legacy Run | Native Invoke | Revision-Based |
+|--------|------------|---------------|----------------|
+| **Simplicity** | High | Medium | Low |
+| **Frontend Changes** | Minimal | Medium | Significant |
+| **Architecture Alignment** | Legacy | Native | Most Native |
+| **Flexibility** | Low | High | High |
+| **Custom Evaluators** | Limited | Full Support | Full Support |
+| **Requires URI** | No (uses key) | Yes | Yes (fetched) |
+
+**Recommendation:** 
+
+For the Evaluator Playground migration:
+- **Short-term:** Keep using legacy `/evaluators/{key}/run/` - it works the same and the backend handles URI resolution internally
+- **Long-term:** Consider migrating to native workflow invoke when supporting custom evaluators or revision-specific execution
+
+---
+
+## New SimpleEvaluator CRUD Endpoints
+
+Base path: `/preview/simple/evaluators/`
+
+| Endpoint | Method | Purpose |
+|----------|--------|---------|
+| `/preview/simple/evaluators/` | POST | Create new evaluator |
+| `/preview/simple/evaluators/{id}` | GET | Fetch evaluator by ID |
+| `/preview/simple/evaluators/{id}` | PUT | Update evaluator |
+| `/preview/simple/evaluators/{id}/archive` | POST | Archive (soft delete) evaluator |
+| `/preview/simple/evaluators/{id}/unarchive` | POST | Restore archived evaluator |
+| `/preview/simple/evaluators/query` | POST | Query evaluators with filters |
+
+## Data Structures
+
+### SimpleEvaluator (Response)
+
+```python
+class SimpleEvaluator:
+    id: UUID
+    slug: str
+    
+    # Lifecycle
+    created_at: datetime
+    updated_at: datetime
+    
+    # Header
+    name: Optional[str]
+    description: Optional[str]
+    
+    # Metadata
+    tags: Optional[List[str]]
+    meta: Optional[dict]
+    
+    # Flags
+    flags: Optional[SimpleEvaluatorFlags]
+    
+    # Data (revision data)
+    data: Optional[SimpleEvaluatorData]
+```
+
+### SimpleEvaluatorData (Revision Configuration)
+
+```python
+class SimpleEvaluatorData:
+    # Version
+    version: Optional[str]  # e.g., "2025.07.14"
+    
+    # Service Interface - THE KEY FIELD
+    uri: Optional[str]      # e.g., "agenta:builtin:auto_exact_match:v0"
+    url: Optional[str]      # For webhook evaluators
+    headers: Optional[Dict[str, Union[Reference, str]]]
+    
+    # Schema definitions
+    schemas: Optional[Dict[str, Schema]]  # e.g., {"outputs": {...}}
+    
+    # Configuration
+    script: Optional[dict]      # For custom code: {"content": "...", "runtime": "python"}
+    parameters: Optional[dict]  # Settings values (same as legacy settings_values)
+    
+    # Legacy fields (for backward compatibility)
+    service: Optional[dict]
+    configuration: Optional[dict]
+```
+
+### URI-based Handler Registry
+
+The SDK maintains registries that map URIs to implementations:
+
+```python
+HANDLER_REGISTRY = {
+    "agenta": {
+        "builtin": {
+            "echo": {"v0": echo_v0},
+            "auto_exact_match": {"v0": auto_exact_match_v0},
+            "auto_regex_test": {"v0": auto_regex_test_v0},
+            # ... all built-in evaluators
+        }
+    },
+    "user": {
+        "custom": {
+            # User-defined evaluators go here
+        }
+    }
+}
+```
+
+Retrieve handler by URI:
+```python
+handler = retrieve_handler("agenta:builtin:auto_exact_match:v0")
+```
+
+---
+
+## Endpoint Comparison: Old vs New (CRUD)
+
+### List Evaluator Configs
+
+**Old:**
+```
+GET /evaluators/configs/?project_id={project_id}
+
+Response: EvaluatorConfig[]
+{
+    id: string
+    name: string
+    evaluator_key: string
+    settings_values: object
+    created_at: string
+    updated_at: string
+}
+```
+
+**New:**
+```
+POST /preview/simple/evaluators/query?project_id={project_id}
+
+Request: SimpleEvaluatorQuery
+{
+    flags?: { is_evaluator: true }
+}
+
+Response: SimpleEvaluatorsResponse
+{
+    count: number
+    evaluators: SimpleEvaluator[]
+}
+```
+
+### Create Evaluator Config
+
+**Old:**
+```
+POST /evaluators/configs/?project_id={project_id}
+
+Request: NewEvaluatorConfig
+{
+    name: string
+    evaluator_key: string
+    settings_values: object
+}
+
+Response: EvaluatorConfig
+```
+
+**New:**
+```
+POST /preview/simple/evaluators/?project_id={project_id}
+
+Request: SimpleEvaluatorCreateRequest
+{
+    evaluator: {
+        slug: string       # Generated from name
+        name: string
+        flags: { is_evaluator: true }
+        data: {
+            uri: "agenta:builtin:{evaluator_key}:v0"
+            parameters: object  # settings_values
+            schemas: { outputs: object }  # Output schema
+        }
+    }
+}
+
+Response: SimpleEvaluatorResponse
+{
+    count: number
+    evaluator: SimpleEvaluator
+}
+```
+
+### Update Evaluator Config
+
+**Old:**
+```
+PUT /evaluators/configs/{id}/?project_id={project_id}
+
+Request: UpdateEvaluatorConfig
+{
+    name?: string
+    settings_values?: object
+}
+
+Response: EvaluatorConfig
+```
+
+**New:**
+```
+PUT /preview/simple/evaluators/{id}?project_id={project_id}
+
+Request: SimpleEvaluatorEditRequest
+{
+    evaluator: {
+        id: UUID
+        name?: string
+        data?: {
+            parameters?: object  # settings_values
+        }
+    }
+}
+
+Response: SimpleEvaluatorResponse
+```
+
+### Delete Evaluator Config
+
+**Old:**
+```
+DELETE /evaluators/configs/{id}/?project_id={project_id}
+
+Response: boolean
+```
+
+**New:**
+```
+POST /preview/simple/evaluators/{id}/archive?project_id={project_id}
+
+Response: SimpleEvaluatorResponse
+```
+
+---
+
+## Key Differences Summary
+
+### 1. URI-based Evaluator Identification
+
+**Old:** `evaluator_key: "auto_exact_match"`
+
+**New:** `uri: "agenta:builtin:auto_exact_match:v0"`
+
+The URI enables:
+- Version management (`v0`, `v1`, etc.)
+- Custom evaluators (`user:custom:my_eval:latest`)
+- Handler registry lookup
+
+### 2. Settings Location
+
+**Old:** `settings_values: { threshold: 0.5 }`
+
+**New:** `data.parameters: { threshold: 0.5 }`
+
+### 3. Output Schema (New)
+
+The new model includes explicit output schemas:
+
+```python
+data.schemas = {
+    "outputs": {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "type": "object",
+        "properties": {
+            "score": {"type": "number"},
+            "success": {"type": "boolean"}
+        }
+    }
+}
+```
+
+### 4. Soft Delete vs Hard Delete
+
+- **Old:** Hard delete (`DELETE`)
+- **New:** Soft delete via archive (`POST .../archive`)
+
+### 5. Response Wrapper
+
+**Old:** Returns data directly
+
+**New:** Returns wrapped response: `{ count: number, evaluator: SimpleEvaluator }`
+
+---
+
+## Frontend Mapping Requirements
+
+To migrate, the frontend needs to:
+
+1. **When creating an evaluator:**
+   - Generate `slug` from name
+   - Build `uri` from `evaluator_key`: `"agenta:builtin:{evaluator_key}:v0"`
+   - Move `settings_values` to `data.parameters`
+   - Set `flags.is_evaluator = true`
+   - Optionally include `data.schemas.outputs`
+
+2. **When reading evaluators:**
+   - Extract `evaluator_key` from `uri` (parse the third segment)
+   - Read settings from `data.parameters`
+   - Unwrap response from `{ evaluator: ... }`
+
+3. **When updating:**
+   - Include `id` in request body
+   - Update `data.parameters` for settings changes
+
+4. **When deleting:**
+   - Use `POST .../archive` instead of `DELETE`
+
+5. **When running evaluators:**
+   - **Option A (Recommended):** Keep using `/evaluators/{key}/run/` - no change needed
+   - **Option B (Native):** Use `/preview/workflows/invoke` with URI from revision
diff --git a/docs/design/migrate-evaluator-playground/plan.md b/docs/design/migrate-evaluator-playground/plan.md
new file mode 100644
index 0000000000..35a1f9eb4a
--- /dev/null
+++ b/docs/design/migrate-evaluator-playground/plan.md
@@ -0,0 +1,450 @@
+# Migration Plan: Evaluator Playground
+
+## Overview
+
+This plan outlines an incremental migration approach that minimizes risk and allows for gradual rollout. The key principle is **transform at boundaries** - keep internal data shapes stable and only change API interactions.
+
+## Migration Strategy
+
+Two viable strategies exist:
+
+- Plan A (transitional): adapter pattern, keep internal legacy `EvaluatorConfig` shape
+- Plan B (preferred destination): direct migration, internal shapes become `SimpleEvaluator` + native invoke
+
+This file documents Plan A as the low-risk execution plan. For the direct plan, see `docs/design/migrate-evaluator-playground/migration-options.md`.
+
+## Plan A: Adapter Pattern
+
+Instead of changing data shapes throughout the codebase, we'll:
+1. Create adapter functions at the API boundary
+2. New endpoints return `SimpleEvaluator`, adapters convert to internal `EvaluatorConfig` shape
+3. Internal components continue working unchanged
+4. Gradually update internals later (optional)
+
+```
+┌──────────────┐     ┌──────────────┐     ┌──────────────────┐
+│  New API     │ ──► │   Adapter    │ ──► │  Internal Shape  │
+│  Endpoints   │     │   Layer      │     │  (unchanged)     │
+└──────────────┘     └──────────────┘     └──────────────────┘
+```
+
+---
+
+## Phase 1: Foundation (Low Risk)
+
+**Goal:** Create adapter layer and new service functions without changing existing code
+
+### Tasks
+
+#### 1.1 Create Type Definitions
+
+**File:** `web/oss/src/lib/Types.ts` or new file `web/oss/src/services/evaluators/types.ts`
+
+```typescript
+// New API types
+interface SimpleEvaluatorData {
+    version?: string
+    uri?: string
+    url?: string
+    headers?: Record<string, string>
+    schemas?: { outputs?: Record<string, any> }
+    script?: { content: string; runtime: string }
+    parameters?: Record<string, any>
+}
+
+interface SimpleEvaluatorFlags {
+    is_custom?: boolean
+    is_evaluator?: boolean
+    is_human?: boolean
+}
+
+interface SimpleEvaluator {
+    id: string
+    slug: string
+    name?: string
+    description?: string
+    tags?: string[]
+    meta?: Record<string, any>
+    flags?: SimpleEvaluatorFlags
+    data?: SimpleEvaluatorData
+    created_at: string
+    updated_at: string
+}
+
+interface SimpleEvaluatorResponse {
+    count: number
+    evaluator: SimpleEvaluator | null
+}
+
+interface SimpleEvaluatorsResponse {
+    count: number
+    evaluators: SimpleEvaluator[]
+}
+```
+
+#### 1.2 Create Adapter Functions
+
+**File:** `web/oss/src/services/evaluators/adapters.ts`
+
+```typescript
+import { EvaluatorConfig } from "@/oss/lib/Types"
+import { SimpleEvaluator, SimpleEvaluatorData } from "./types"
+import { getTagColors } from "@/oss/lib/helpers/colors"
+import { stringToNumberInRange } from "@/oss/lib/helpers/utils"
+
+/**
+ * Extract evaluator_key from URI
+ * URI format: "agenta:builtin:{key}:v0"
+ */
+export function extractEvaluatorKey(uri: string | undefined): string {
+    if (!uri) return ""
+    const parts = uri.split(":")
+    if (parts.length >= 3 && parts[0] === "agenta" && parts[1] === "builtin") {
+        return parts[2]
+    }
+    return ""
+}
+
+/**
+ * Build URI from evaluator key
+ */
+export function buildEvaluatorUri(evaluatorKey: string): string {
+    return `agenta:builtin:${evaluatorKey}:v0`
+}
+
+/**
+ * Convert SimpleEvaluator to internal EvaluatorConfig shape
+ */
+export function simpleEvaluatorToConfig(
+    simple: SimpleEvaluator,
+    projectId?: string
+): EvaluatorConfig {
+    const tagColors = getTagColors()
+    const evaluatorKey = extractEvaluatorKey(simple.data?.uri)
+    
+    return {
+        id: simple.id,
+        name: simple.name || "",
+        evaluator_key: evaluatorKey,
+        settings_values: simple.data?.parameters || {},
+        created_at: simple.created_at,
+        updated_at: simple.updated_at,
+        // Frontend additions
+        color: tagColors[stringToNumberInRange(evaluatorKey, 0, tagColors.length - 1)],
+        tags: simple.tags,
+    }
+}
+
+/**
+ * Convert internal EvaluatorConfig to SimpleEvaluator create payload
+ */
+export function configToSimpleEvaluatorCreate(
+    config: Omit<EvaluatorConfig, "id" | "created_at">,
+    outputsSchema?: Record<string, any>
+): SimpleEvaluatorCreate {
+    return {
+        slug: generateSlug(config.name),
+        name: config.name,
+        flags: { is_evaluator: true },
+        data: {
+            uri: buildEvaluatorUri(config.evaluator_key),
+            parameters: config.settings_values,
+            schemas: outputsSchema ? { outputs: outputsSchema } : undefined,
+        },
+    }
+}
+
+/**
+ * Generate slug from name
+ */
+function generateSlug(name: string): string {
+    return name
+        .toLowerCase()
+        .replace(/[^a-z0-9]+/g, "-")
+        .replace(/^-|-$/g, "")
+}
+```
+
+#### 1.3 Create New Service Functions
+
+**File:** `web/oss/src/services/evaluators/index.ts` (add to existing)
+
+```typescript
+// === NEW ENDPOINT FUNCTIONS ===
+
+export const fetchAllEvaluatorConfigsV2 = async (
+    projectIdOverride?: string | null,
+): Promise<EvaluatorConfig[]> => {
+    const {projectId: projectIdFromStore} = getProjectValues()
+    const projectId = projectIdOverride ?? projectIdFromStore
+
+    if (!projectId) return []
+
+    const response = await axios.post(
+        `${getAgentaApiUrl()}/preview/simple/evaluators/query?project_id=${projectId}`,
+        { flags: { is_evaluator: true } }
+    )
+    
+    const evaluators = response.data?.evaluators || []
+    return evaluators.map((e: SimpleEvaluator) => simpleEvaluatorToConfig(e, projectId))
+}
+
+export const createEvaluatorConfigV2 = async (
+    config: CreateEvaluationConfigData,
+): Promise<EvaluatorConfig> => {
+    const {projectId} = getProjectValues()
+    
+    const payload = configToSimpleEvaluatorCreate(config)
+    
+    const response = await axios.post(
+        `${getAgentaApiUrl()}/preview/simple/evaluators/?project_id=${projectId}`,
+        payload,
+    )
+    
+    const simple = response.data?.evaluator
+    if (!simple) throw new Error("Failed to create evaluator")
+    
+    return simpleEvaluatorToConfig(simple, projectId)
+}
+
+export const updateEvaluatorConfigV2 = async (
+    configId: string,
+    config: Partial<CreateEvaluationConfigData>,
+): Promise<EvaluatorConfig> => {
+    const {projectId} = getProjectValues()
+
+    const payload: SimpleEvaluatorEdit = {
+        id: configId,
+        name: config.name,
+        data: config.settings_values 
+            ? { parameters: config.settings_values }
+            : undefined,
+    }
+
+    const response = await axios.put(
+        `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}?project_id=${projectId}`,
+        payload,
+    )
+    
+    const simple = response.data?.evaluator
+    if (!simple) throw new Error("Failed to update evaluator")
+    
+    return simpleEvaluatorToConfig(simple, projectId)
+}
+
+export const deleteEvaluatorConfigV2 = async (configId: string): Promise<boolean> => {
+    const {projectId} = getProjectValues()
+
+    await axios.post(
+        `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}/archive?project_id=${projectId}`,
+    )
+    
+    return true
+}
+```
+
+**Deliverables:**
+- [ ] Type definitions for new API shapes
+- [ ] Adapter functions (both directions)
+- [ ] New service functions with V2 suffix
+- [ ] Unit tests for adapters
+
+**Estimated Effort:** 1-2 days
+
+---
+
+## Phase 2: Feature Flag Integration (Low Risk)
+
+**Goal:** Add feature flag to toggle between old and new endpoints
+
+### Tasks
+
+#### 2.1 Add Feature Flag
+
+**File:** `web/oss/src/lib/helpers/featureFlags.ts` or environment config
+
+```typescript
+export const USE_NEW_EVALUATOR_ENDPOINTS = 
+    process.env.NEXT_PUBLIC_USE_NEW_EVALUATOR_ENDPOINTS === "true"
+```
+
+#### 2.2 Create Unified Service Functions
+
+**File:** `web/oss/src/services/evaluators/index.ts`
+
+```typescript
+// Unified functions that use feature flag
+export const fetchAllEvaluatorConfigs = async (...args) => {
+    if (USE_NEW_EVALUATOR_ENDPOINTS) {
+        return fetchAllEvaluatorConfigsV2(...args)
+    }
+    return fetchAllEvaluatorConfigsLegacy(...args)
+}
+
+export const createEvaluatorConfig = async (...args) => {
+    if (USE_NEW_EVALUATOR_ENDPOINTS) {
+        return createEvaluatorConfigV2(...args)
+    }
+    return createEvaluatorConfigLegacy(...args)
+}
+
+// ... same for update and delete
+```
+
+**Deliverables:**
+- [ ] Feature flag configuration
+- [ ] Unified service functions with flag branching
+- [ ] Documentation for enabling flag
+
+**Estimated Effort:** 0.5 days
+
+---
+
+## Phase 3: Integration Testing (Medium Risk)
+
+**Goal:** Verify new endpoints work correctly with existing UI
+
+### Tasks
+
+#### 3.1 Enable Feature Flag in Development
+
+- Set `NEXT_PUBLIC_USE_NEW_EVALUATOR_ENDPOINTS=true` in dev environment
+- Test all evaluator playground flows
+
+#### 3.2 Test Cases
+
+1. **List Evaluators**
+   - [ ] Registry shows all existing evaluator configs
+   - [ ] Correct names, types, and icons displayed
+   - [ ] Filtering and search work
+
+2. **Create Evaluator**
+   - [ ] Select template → Configure → Commit
+   - [ ] Settings saved correctly
+   - [ ] Redirects to edit page after create
+
+3. **Edit Evaluator**
+   - [ ] Load existing config
+   - [ ] Form populated with current values
+   - [ ] Update settings
+   - [ ] Changes persisted
+
+4. **Delete Evaluator**
+   - [ ] Delete confirmation works
+   - [ ] Evaluator removed from list
+   - [ ] No errors
+
+5. **Test Evaluator**
+   - [ ] Load testcase
+   - [ ] Run variant
+   - [ ] Run evaluator
+   - [ ] Results displayed correctly
+
+**Deliverables:**
+- [ ] Test results document
+- [ ] Bug fixes for any issues found
+- [ ] Performance comparison (if applicable)
+
+**Estimated Effort:** 2-3 days
+
+---
+
+## Phase 4: Gradual Rollout (Low Risk)
+
+**Goal:** Enable new endpoints for subset of users
+
+### Tasks
+
+#### 4.1 Staged Rollout
+
+1. **Internal testing:** Enable for team members only
+2. **Beta users:** Enable for opt-in users
+3. **General availability:** Enable for all users
+
+#### 4.2 Monitoring
+
+- Monitor error rates for evaluator operations
+- Track API response times
+- Watch for unexpected 404/500 errors
+
+**Deliverables:**
+- [ ] Rollout schedule
+- [ ] Rollback procedure documented
+- [ ] Monitoring dashboards/alerts
+
+**Estimated Effort:** 1-2 weeks (elapsed time)
+
+---
+
+## Phase 5: Cleanup (Low Risk)
+
+**Goal:** Remove legacy code and feature flag
+
+### Tasks
+
+#### 5.1 Remove Legacy Functions
+
+- Remove `fetchAllEvaluatorConfigsLegacy`
+- Remove `createEvaluatorConfigLegacy`
+- Remove `updateEvaluatorConfigLegacy`
+- Remove `deleteEvaluatorConfigLegacy`
+
+#### 5.2 Remove Feature Flag
+
+- Remove feature flag checks
+- Clean up V2 suffix from function names
+
+#### 5.3 Update Documentation
+
+- Update API documentation
+- Update developer docs
+
+**Deliverables:**
+- [ ] Legacy code removed
+- [ ] Feature flag removed
+- [ ] Documentation updated
+- [ ] PR for cleanup
+
+**Estimated Effort:** 1 day
+
+---
+
+## Timeline Summary
+
+| Phase | Duration | Risk | Dependencies |
+|-------|----------|------|--------------|
+| Phase 1: Foundation | 1-2 days | Low | None |
+| Phase 2: Feature Flag | 0.5 days | Low | Phase 1 |
+| Phase 3: Integration Testing | 2-3 days | Medium | Phase 2, Backend PR merged |
+| Phase 4: Gradual Rollout | 1-2 weeks | Low | Phase 3 |
+| Phase 5: Cleanup | 1 day | Low | Phase 4 complete |
+
+**Total Implementation Time:** ~5-7 days
+**Total Rollout Time:** ~2-3 weeks
+
+---
+
+## Rollback Plan
+
+If issues are discovered after deployment:
+
+1. **Immediate:** Set feature flag to `false`
+2. **Short-term:** Deploy hotfix to disable new endpoints
+3. **Investigation:** Analyze issues with new endpoints
+4. **Resolution:** Fix and re-test before re-enabling
+
+---
+
+## Open Questions
+
+1. **Output Schema Generation:** Should the frontend generate output schemas when creating evaluators, or should the backend handle this?
+   - Current PR shows backend generates schemas during migration
+   - Frontend may need to include schema for new configs
+
+2. **Slug Generation:** Should slugs be generated client-side or server-side?
+   - Server-side is safer (uniqueness checks)
+   - Client-side is faster (no round-trip)
+
+3. **Error Handling:** How should the frontend handle validation errors from new endpoints?
+   - New endpoints may return different error shapes
+   - Need to map to user-friendly messages
diff --git a/docs/design/migrate-evaluator-playground/research.md b/docs/design/migrate-evaluator-playground/research.md
new file mode 100644
index 0000000000..eda511d37b
--- /dev/null
+++ b/docs/design/migrate-evaluator-playground/research.md
@@ -0,0 +1,211 @@
+# Research Notes: Evaluator Execution Architecture
+
+## Findings from PR #3527 Investigation
+
+### Discovery: Native Evaluator Execution Path
+
+The new architecture treats evaluators as workflows with URI-based identification. The key discovery is that even the legacy `/evaluators/{key}/run/` endpoint now uses the native handler registry internally.
+
+### Handler Registry Architecture
+
+The SDK maintains a global registry of workflow handlers:
+
+**Location:** `sdk/agenta/sdk/workflows/utils.py`
+
+```python
+HANDLER_REGISTRY = {
+    "agenta": {
+        "builtin": {
+            "echo": {"v0": echo_v0},
+            "auto_exact_match": {"v0": auto_exact_match_v0},
+            "auto_regex_test": {"v0": auto_regex_test_v0},
+            "field_match_test": {"v0": field_match_test_v0},
+            "json_multi_field_match": {"v0": json_multi_field_match_v0},
+            "auto_webhook_test": {"v0": auto_webhook_test_v0},
+            "auto_custom_code_run": {"v0": auto_custom_code_run_v0},
+            "auto_ai_critique": {"v0": auto_ai_critique_v0},
+            # ... more evaluators
+        }
+    },
+    "user": {
+        "custom": {
+            # Custom user evaluators
+        }
+    }
+}
+```
+
+**URI Format:** `provider:kind:key:version`
+
+Examples:
+- `agenta:builtin:auto_exact_match:v0`
+- `user:custom:my_custom_eval:latest`
+
+**URI Parsing:**
+```python
+def parse_uri(uri: str) -> Tuple[provider, kind, key, version]:
+    # "agenta:builtin:echo:v0" → ("agenta", "builtin", "echo", "v0")
+```
+
+### How the Legacy Run Endpoint Works Now (PR #3527)
+
+**File:** `api/oss/src/routers/evaluators_router.py`
+
+The PR changed the implementation to use the native handler registry:
+
+```python
+@router.post("/{evaluator_key}/run/", response_model=EvaluatorOutputInterface)
+async def evaluator_run(request: Request, evaluator_key: str, payload: EvaluatorInputInterface):
+    # ... auth setup ...
+    result = await _run_evaluator(evaluator_key, payload)
+    return result
+
+async def _run_evaluator(evaluator_key: str, evaluator_input: EvaluatorInputInterface):
+    # Build URI from evaluator_key
+    uri = f"agenta:builtin:{evaluator_key}:v0"
+    
+    # Retrieve the handler from SDK registry
+    handler = retrieve_handler(uri)
+    if handler is None:
+        raise NotImplementedError(f"Evaluator {evaluator_key} not found (uri={uri})")
+    
+    # Extract data from evaluator_input
+    inputs = evaluator_input.inputs or {}
+    settings = evaluator_input.settings or {}
+    outputs = inputs.get("prediction", inputs.get("output"))
+    
+    # Build kwargs based on handler signature
+    sig = inspect.signature(handler)
+    kwargs = {}
+    if "parameters" in sig.parameters:
+        kwargs["parameters"] = settings
+    if "inputs" in sig.parameters:
+        kwargs["inputs"] = inputs
+    if "outputs" in sig.parameters:
+        kwargs["outputs"] = outputs
+    
+    # Invoke the handler
+    result = handler(**kwargs)
+    if inspect.iscoroutine(result):
+        result = await result
+    
+    return {"outputs": result}
+```
+
+**Key Insight:** The legacy endpoint is now a thin wrapper that:
+1. Builds the URI from the evaluator_key
+2. Looks up the handler in the registry
+3. Invokes it directly
+
+### Native Workflow Invoke Path
+
+For fully native execution, there's also a generic workflow invoke endpoint:
+
+**Endpoint:** `POST /preview/workflows/invoke`
+
+**Request Structure:**
+```python
+class WorkflowServiceRequest:
+    data: WorkflowServiceRequestData  # inputs, outputs, parameters
+    revision: Optional[dict]           # contains URI in data.uri
+```
+
+**How Batch Evaluations Use It:**
+
+**File:** `api/oss/src/core/evaluations/tasks/legacy.py` (lines 1185-1228)
+
+```python
+workflow_service_request_data = WorkflowServiceRequestData(
+    inputs=inputs,
+    outputs=outputs,
+    #
+    parameters=evaluator_reference.get("configuration"),  # settings
+)
+
+workflow_service_request = WorkflowServiceRequest(
+    data=workflow_service_request_data,
+    #
+    environment=environment,
+    revision=evaluator_reference.get("revision"),  # contains URI
+)
+
+await workflows_service.invoke_workflow(
+    project_id=project_id,
+    user_id=user_id,
+    request=workflow_service_request,
+)
+```
+
+### Implications for Frontend Migration
+
+#### For Evaluator CRUD (Create/Read/Update/Delete)
+
+**Must migrate to new endpoints** because:
+- Legacy endpoints now call SimpleEvaluator endpoints internally
+- Data is stored in new workflow-based format
+- Frontend should use native API to avoid translation overhead
+
+#### For Evaluator Run (Testing in Playground)
+
+**Options:**
+
+1. **Keep using `/evaluators/{key}/run/`** (Recommended for now)
+   - Simplest approach
+   - Endpoint still works
+   - Internally uses native path
+   - No frontend changes needed
+
+2. **Use native workflow invoke**
+   - Requires building `WorkflowServiceRequest`
+   - Need to include evaluator revision with URI
+   - More complex but more "correct"
+   - Enables custom evaluator support
+
+3. **Hybrid approach**
+   - Use legacy endpoint for built-in evaluators
+   - Use native invoke for custom evaluators (which will have custom URIs)
+
+### Questions Resolved
+
+**Q: Why does the legacy run endpoint remain unchanged?**
+
+A: It's not unchanged internally - PR #3527 refactored it to use the native handler registry. But the external interface (URL, request/response format) is preserved for backward compatibility.
+
+**Q: Is there a "native" way to run evaluators?**
+
+A: Yes, via the workflow invoke endpoint with `WorkflowServiceRequest` containing the evaluator's URI. For the playground, however, the legacy endpoint is simpler and, for built-in evaluators, functionally equivalent (it always resolves `agenta:builtin:{key}:v0`).
+
+**Q: Should we migrate the run endpoint usage?**
+
+A: Not necessarily. The benefits of migrating would be:
+- Consistency with new architecture
+- Support for custom evaluators with custom URIs
+- Ability to run specific evaluator revisions
+
+But the costs are:
+- More complex payload construction
+- Need to fetch evaluator revision to get URI
+- No immediate user-facing benefit
+
+**Recommendation:** Keep using the legacy run endpoint for now; plan the native invoke path for the custom evaluator feature.
+
+## Note on "Qdrant changes"
+
+Within this repository, Qdrant appears in examples and cookbook/tutorial code (e.g., `examples/python/*`, `docs/docs/tutorials/*`), but not in the core evaluator/workflow execution path under `api/oss/src`.
+
+Implication for this migration:
+- Migrating the evaluator playground to `/preview/workflows/invoke` does not require any Qdrant-specific frontend changes.
+- Any Qdrant-related behavior is part of the *application/workflow being evaluated* (e.g., a RAG app calling Qdrant), and would surface only through normal workflow invocation inputs/outputs/traces.
+
+---
+
+## Related Files Analyzed
+
+- `api/oss/src/routers/evaluators_router.py` - Legacy endpoints (now with native internals)
+- `api/oss/src/apis/fastapi/evaluators/router.py` - New SimpleEvaluators router
+- `api/oss/src/apis/fastapi/workflows/router.py` - Workflow invoke endpoint
+- `api/oss/src/core/workflows/service.py` - Workflow invocation service
+- `api/oss/src/core/evaluations/tasks/legacy.py` - Batch evaluation using native invoke
+- `sdk/agenta/sdk/workflows/utils.py` - Handler registry and URI parsing
+- `sdk/agenta/sdk/workflows/interfaces.py` - Evaluator interfaces (schemas)
+- `sdk/agenta/sdk/workflows/handlers.py` - Actual evaluator implementations
diff --git a/docs/design/migrate-evaluator-playground/risk-analysis.md b/docs/design/migrate-evaluator-playground/risk-analysis.md
new file mode 100644
index 0000000000..0bd037f0a0
--- /dev/null
+++ b/docs/design/migrate-evaluator-playground/risk-analysis.md
@@ -0,0 +1,287 @@
+# Risk Analysis: Evaluator Playground Migration
+
+## Coupling Points
+
+### 1. State Management Coupling
+
+**Location:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts`
+
+**Risk Level:** MEDIUM
+
+The playground state is tightly coupled to the `EvaluatorConfig` shape:
+
+```typescript
+// playgroundEditValuesAtom expects EvaluatorConfig shape
+interface EvaluatorConfig {
+    id: string
+    evaluator_key: string
+    name: string
+    settings_values: Record<string, any>
+}
+```
+
+**Impact:** 
+- `commitPlaygroundAtom` expects `EvaluatorConfig` as input
+- `playgroundEditValuesAtom` is read throughout ConfigureEvaluator and DebugSection
+- Form initialization relies on `settings_values` property name
+
+**Mitigation:**
+- Create adapter functions to convert between `SimpleEvaluator` and internal state
+- Or update atoms to use `SimpleEvaluator` shape and update all consumers
+
+---
+
+### 2. Form Initialization Coupling
+
+**Location:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx`
+
+**Risk Level:** MEDIUM
+
+Form initialization directly accesses `settings_values`:
+
+```typescript
+// Line 383-410
+if (editMode && editEvalEditValues) {
+    form.setFieldsValue({
+        ...editEvalEditValues,
+        settings_values: editEvalEditValues.settings_values || {},
+    })
+}
+```
+
+**Impact:**
+- Changing to `data.parameters` would break form binding
+- DynamicFormField components use `["settings_values", field.key]` name paths
+
+**Mitigation:**
+- Keep internal form structure as `settings_values` 
+- Transform on API boundary (adapter pattern)
+
+---
+
+### 3. Service Layer Coupling
+
+**Location:** `web/oss/src/services/evaluators/index.ts`
+
+**Risk Level:** LOW-MEDIUM
+
+API calls directly construct legacy payload shapes:
+
+```typescript
+// createEvaluatorConfig
+return axios.post(`/evaluators/configs?project_id=${projectId}`, {
+    ...config,
+})
+
+// updateEvaluatorConfig  
+return axios.put(`/evaluators/configs/${configId}?project_id=${projectId}`, config)
+```
+
+**Impact:**
+- Need to update URLs and payload transformation
+- Response handling needs to unwrap `{ evaluator: ... }` wrapper
+
+**Mitigation:**
+- Create new service functions for new endpoints
+- Keep old functions temporarily for gradual migration
+- Add response/request transformers
+
+---
+
+### 4. Evaluators Registry Coupling
+
+**Location:** `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts`
+
+**Risk Level:** MEDIUM
+
+The hook transforms and combines data from two sources:
+
+```typescript
+const {evaluatorConfigs} = useFetchEvaluatorsData()
+// Combines with evaluator templates for display
+```
+
+**Impact:**
+- Table columns expect `evaluator_key` property
+- Tag cells, type pills depend on config shape
+- Filtering/search operates on legacy property names
+
+**Mitigation:**
+- Update hook to handle new `SimpleEvaluator` shape
+- Transform data at fetch boundary, keep internal shape consistent
+
+---
+
+### 5. Debug Section - Evaluator Run Coupling
+
+**Location:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx`
+
+**Risk Level:** LOW
+
+The evaluator run uses `evaluator_key` directly:
+
+```typescript
+// Line 456
+const runResponse = await createEvaluatorRunExecution(
+    selectedEvaluator.key,  // evaluator_key
+    { inputs: outputs, settings: ... }
+)
+```
+
+**Impact:**
+- This endpoint (`/evaluators/{key}/run/`) remains unchanged
+- Uses `selectedEvaluator.key` from template, not config
+- No direct coupling to `EvaluatorConfig` shape
+
+**Mitigation:**
+- No changes needed for run functionality
+- Keep using evaluator templates for the `key` value
+
+---
+
+### 6. Global Atoms Coupling
+
+**Location:** `web/oss/src/state/evaluators/atoms.ts`
+
+**Risk Level:** MEDIUM
+
+Query atoms return legacy-shaped data:
+
+```typescript
+const evaluatorConfigsQueryAtomFamily = atomFamily((projectId) =>
+    atomWithQuery(() => ({
+        queryKey: ['evaluator-configs', projectId],
+        queryFn: () => fetchAllEvaluatorConfigs(null, projectId),
+    }))
+)
+```
+
+**Impact:**
+- Multiple components may depend on these atoms
+- Changing shape could cascade through application
+
+**Mitigation:**
+- Update query functions to use new endpoints
+- Transform response at query boundary to maintain internal shape
+- Or update all consumers to handle new shape
+
+---
+
+### 7. Evaluator Templates vs Configs Distinction
+
+**Location:** Throughout frontend
+
+**Risk Level:** LOW
+
+The frontend distinguishes between:
+- **Evaluator templates** (`Evaluator`): Built-in evaluator definitions with `settings_template`
+- **Evaluator configs** (`EvaluatorConfig`): User-created configurations with `settings_values`
+
+**Impact:**
+- This distinction is maintained in the new system
+- Templates come from `/evaluators/` (unchanged)
+- Configs become `SimpleEvaluator` objects
+
+**Mitigation:**
+- No conceptual change needed
+- Just update config handling
+
+---
+
+## Risk Summary Table
+
+| Component | Risk Level | Complexity | Priority |
+|-----------|-----------|------------|----------|
+| Service Layer | LOW-MEDIUM | LOW | HIGH (change first) |
+| State Atoms | MEDIUM | MEDIUM | HIGH |
+| ConfigureEvaluator Form | MEDIUM | MEDIUM | MEDIUM |
+| Evaluators Registry | MEDIUM | MEDIUM | MEDIUM |
+| Debug Section | LOW | LOW | LOW |
+| Global Query Atoms | MEDIUM | LOW | MEDIUM |
+
+## Concrete Breakage Scenarios
+
+### Scenario 1: Form Submission Fails
+
+**Trigger:** Change `settings_values` to `data.parameters` without updating form
+
+**Symptoms:**
+- Form submits but settings are lost
+- Backend receives empty configuration
+- Evaluator created but doesn't work
+
+**Prevention:**
+- Transform at API boundary, not in form
+- Test form submission with real backend
+
+---
+
+### Scenario 2: Evaluator List Empty
+
+**Trigger:** Query endpoint returns new shape, UI expects old
+
+**Symptoms:**
+- Evaluators registry shows empty list
+- No error messages (data exists but unparseable)
+- Console shows undefined property access
+
+**Prevention:**
+- Update data transformation in hook
+- Add null checks and fallbacks
+- Log transformation errors
+
+---
+
+### Scenario 3: Edit Mode Fails to Load
+
+**Trigger:** `playgroundEditValuesAtom` receives `SimpleEvaluator`, expects `EvaluatorConfig`
+
+**Symptoms:**
+- Navigate to edit page, form is empty
+- Settings not populated
+- Save overwrites with empty config
+
+**Prevention:**
+- Transform at atom level
+- Test edit flow with existing configs
+
+---
+
+### Scenario 4: Delete Fails Silently
+
+**Trigger:** `DELETE` endpoint no longer exists, `POST .../archive` required
+
+**Symptoms:**
+- Click delete, no error
+- Evaluator still appears
+- Network tab shows 404/405
+
+**Prevention:**
+- Update delete function to use archive endpoint
+- Verify response handling
+
+---
+
+## Recommended Testing Strategy
+
+### Unit Tests
+- [ ] Service layer transformers (old shape ↔ new shape)
+- [ ] URI parsing (`agenta:builtin:key:v0` → `key`)
+- [ ] Slug generation from name
+
+### Integration Tests
+- [ ] Create evaluator config flow
+- [ ] Edit evaluator config flow  
+- [ ] Delete (archive) evaluator config flow
+- [ ] List/query evaluator configs flow
+
+### E2E Tests
+- [ ] Full playground flow: select template → configure → test → commit
+- [ ] Edit existing evaluator configuration
+- [ ] Clone evaluator configuration
+- [ ] Delete evaluator configuration
+
+### Regression Tests
+- [ ] Evaluator run still works
+- [ ] Batch evaluations still work (use config IDs)
+- [ ] Existing configs load correctly after migration
diff --git a/docs/design/migrate-evaluator-playground/status.md b/docs/design/migrate-evaluator-playground/status.md
new file mode 100644
index 0000000000..e0f32606eb
--- /dev/null
+++ b/docs/design/migrate-evaluator-playground/status.md
@@ -0,0 +1,134 @@
+# Status: Evaluator Playground Migration
+
+## Current Phase: Research Complete
+
+**Last Updated:** 2026-01-27
+
+---
+
+## Progress Summary
+
+### Completed
+
+- [x] Map current Evaluator Playground implementation
+  - Identified all frontend components
+  - Documented state management (atoms)
+  - Mapped API endpoints used
+  - Documented data flow
+
+- [x] Analyze PR #3527 (backend migration)
+  - Understood new `SimpleEvaluator` data model
+  - Documented new endpoint shapes
+  - Identified backward compatibility layer
+
+- [x] Investigate native evaluator execution path
+  - Confirmed `/evaluators/{key}/run` now resolves `agenta:builtin:{key}:v0` via SDK handler registry
+  - Confirmed native workflow execution endpoint exists: `POST /preview/workflows/invoke`
+  - Documented request structure used by batch evaluation tasks
+
+- [x] Compare old vs new endpoints
+  - Documented request/response differences
+  - Identified URI-based evaluator identification
+  - Noted response wrapper changes
+
+- [x] Identify coupling and risk areas
+  - State management coupling (MEDIUM risk)
+  - Form initialization coupling (MEDIUM risk)
+  - Service layer coupling (LOW-MEDIUM risk)
+  - Created risk mitigation strategies
+
+- [x] Propose migration plan
+  - Adapter pattern approach
+  - Feature flag integration
+  - Phased rollout strategy
+
+### In Progress
+
+- [ ] Phase 1: Foundation - Not started
+
+### Blocked
+
+- [ ] Phase 3: Integration Testing - Blocked on PR #3527 merge
+
+---
+
+## Key Findings
+
+### 1. The `/evaluators/{key}/run/` endpoint works but is now a wrapper
+
+**Important Discovery:** PR #3527 refactored the legacy run endpoint to use the native handler registry internally:
+- It builds a URI from the evaluator_key: `agenta:builtin:{key}:v0`
+- Uses `retrieve_handler(uri)` to get the actual handler function
+- Directly invokes the handler
+
+**Implication:** The external interface is unchanged, but internally it uses the new architecture.
+
+### 2. Native workflow invoke path exists
+
+There's a fully native way to run evaluators:
+- Endpoint: `POST /preview/workflows/invoke`
+- Uses `WorkflowServiceRequest` with URI in revision data
+- Same mechanism used by batch evaluations
+
+**Recommendation:** Keep using legacy endpoint for now (simpler), consider native invoke for future custom evaluator support.
+
+### 3. URI-based handler registry
+
+The SDK maintains a `HANDLER_REGISTRY` that maps URIs to handler functions:
+- Format: `agenta:builtin:{evaluator_key}:v0`
+- Supports custom evaluators: `user:custom:my_eval:latest`
+- Enables version management of evaluator implementations
+
+### 4. Adapter pattern minimizes risk
+
+By transforming data at the API boundary, we can:
+- Keep internal data shapes unchanged
+- Minimize code changes
+- Enable easy rollback via feature flag
+
+### 5. Output schema handling
+
+The new `SimpleEvaluator` model includes explicit output schemas. The backend migration generates these from evaluator settings. For new configs:
+- Built-in evaluators: Schema can be derived from evaluator type
+- Custom evaluators: Schema should be provided by user
+
+---
+
+## Decisions Made
+
+| Decision | Rationale | Date |
+|----------|-----------|------|
+| Use adapter pattern | Minimizes changes to internal code, enables gradual migration | 2026-01-27 |
+| Feature flag approach | Allows gradual rollout and easy rollback | 2026-01-27 |
+| Keep form structure as `settings_values` | Avoid cascading changes to form components | 2026-01-27 |
+
+---
+
+## Open Questions
+
+1. **Run migration target:** For full migration, do we want the playground to invoke by:
+   - built-in key -> URI (`agenta:builtin:{key}:v0`), or
+   - evaluator revision URI stored on `SimpleEvaluator.data.uri` (preferred), or
+   - a specific evaluator revision id (even more explicit)?
+2. **Output Schema:** Confirm whether frontend must provide `data.schemas.outputs` on create/edit, or backend will derive defaults.
+3. **Slug Generation:** Client-side or server-side?
+
+---
+
+## Next Steps
+
+1. Wait for PR #3527 to be merged
+2. Start Phase 1: Create type definitions and adapters
+3. Add feature flag infrastructure
+4. Test with new endpoints
+
+---
+
+## Related Links
+
+- [PR #3527: Migrate evaluators but keep legacy endpoints](https://github.com/Agenta-AI/agenta/pull/3527)
+- [context.md](./context.md) - Background and goals
+- [current-system.md](./current-system.md) - Current implementation details
+- [new-endpoints.md](./new-endpoints.md) - New endpoint documentation
+- [risk-analysis.md](./risk-analysis.md) - Coupling and risk analysis
+- [plan.md](./plan.md) - Migration execution plan

From df1e622917e48c9d5731890c95dee6355150472e Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Tue, 27 Jan 2026 19:06:38 +0100
Subject: [PATCH 2/4] docs: update plan to direct migration (no adapters),
 split into PR 1 (CRUD) and PR 2 (Run)

---
 .../migrate-evaluator-playground/README.md    |  74 +-
 .../migrate-evaluator-playground/context.md   |  45 +-
 .../migration-options.md                      | 153 ++--
 .../migrate-evaluator-playground/plan.md      | 740 ++++++++++++------
 .../risk-analysis.md                          | 133 ++--
 .../migrate-evaluator-playground/status.md    |  99 +--
 6 files changed, 796 insertions(+), 448 deletions(-)

diff --git a/docs/design/migrate-evaluator-playground/README.md b/docs/design/migrate-evaluator-playground/README.md
index 4197c667d1..b0b9d0c319 100644
--- a/docs/design/migrate-evaluator-playground/README.md
+++ b/docs/design/migrate-evaluator-playground/README.md
@@ -2,12 +2,23 @@
 
 ## Overview
 
-This planning workspace documents the migration of the Evaluator Playground frontend to use the new workflow-based evaluator endpoints. The backend team has migrated evaluators from the old `EvaluatorConfig` model to the new `SimpleEvaluator` (workflow-based) model, and has created backward-compatible legacy endpoints. This migration will update the frontend to use the new endpoints directly.
+This planning workspace documents the migration of the Evaluator Playground frontend to use the new workflow-based evaluator endpoints. The backend team has migrated evaluators from the old `EvaluatorConfig` model to the new `SimpleEvaluator` (workflow-based) model.
+
+## Migration Strategy
+
+**Direct migration (no adapters)** split into two PRs:
+
+| PR | Scope | Description |
+|----|-------|-------------|
+| **PR 1** | CRUD | Migrate to `/preview/simple/evaluators/*`, change internal types to `SimpleEvaluator` |
+| **PR 2** | Run | Migrate to `/preview/workflows/invoke`, add workflow service types |
+
+See [plan.md](./plan.md) for detailed implementation steps.
 
 ## Context
 
-- **PR #3527**: Backend migration that introduces new evaluator endpoints while keeping legacy endpoints for backward compatibility
-- **Goal**: Migrate the Evaluator Playground frontend to use new endpoints, improving consistency with the new workflow-based architecture
+- **PR #3527**: Backend migration that introduces new evaluator endpoints
+- **Goal**: Full migration to new endpoints, no legacy code remaining
 
 ## Documents
 
@@ -17,24 +28,57 @@ This planning workspace documents the migration of the Evaluator Playground fron
 | [current-system.md](./current-system.md) | Detailed map of current Evaluator Playground implementation |
 | [new-endpoints.md](./new-endpoints.md) | New evaluator endpoint shapes and differences from legacy |
 | [research.md](./research.md) | Deep dive into evaluator execution architecture and URI-based handlers |
-| [migration-options.md](./migration-options.md) | Migration plan options: direct vs transitional approaches |
+| [migration-options.md](./migration-options.md) | Why we chose direct migration over adapters |
 | [risk-analysis.md](./risk-analysis.md) | Coupling points and risk areas for the migration |
-| [plan.md](./plan.md) | Migration execution plan with phases and milestones |
+| [plan.md](./plan.md) | **Main plan** - PR 1 (CRUD) and PR 2 (Run) implementation details |
 | [status.md](./status.md) | Living document for progress updates and decisions |
 
-## Key Files Affected
+## Key Mapping Changes
+
+| Legacy | New |
+|--------|-----|
+| `EvaluatorConfig` | `SimpleEvaluator` |
+| `evaluator_key` | derived from `data.uri` |
+| `settings_values` | `data.parameters` |
+| `GET /evaluators/configs/` | `POST /preview/simple/evaluators/query` |
+| `POST /evaluators/configs/` | `POST /preview/simple/evaluators/` |
+| `PUT /evaluators/configs/{id}/` | `PUT /preview/simple/evaluators/{id}` |
+| `DELETE /evaluators/configs/{id}/` | `POST /preview/simple/evaluators/{id}/archive` |
+| `POST /evaluators/{key}/run/` | `POST /preview/workflows/invoke` |
 
-### Frontend - Core Components
-- `web/oss/src/components/Evaluators/` - Evaluators registry
-- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/` - Playground UI
-- `web/oss/src/services/evaluators/index.ts` - API service layer
-- `web/oss/src/services/evaluations/api_ee/index.ts` - Evaluator run execution
+## Files Affected
 
-### Frontend - State Management
-- `web/oss/src/state/evaluators/atoms.ts` - Evaluator query atoms
-- `web/oss/src/lib/atoms/evaluation.ts` - Legacy evaluation atoms
+### PR 1: CRUD Migration
+
+| Area | Files |
+|------|-------|
+| Types | `web/oss/src/lib/Types.ts` |
+| Services | `web/oss/src/services/evaluators/index.ts` |
+| State | `web/oss/src/state/evaluators/atoms.ts` |
+| Playground State | `web/oss/src/components/.../ConfigureEvaluator/state/atoms.ts` |
+| Playground UI | `web/oss/src/components/.../ConfigureEvaluator/index.tsx` |
+| Registry | `web/oss/src/components/Evaluators/index.tsx` |
+| Registry Hook | `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` |
+| Columns | `web/oss/src/components/Evaluators/assets/getColumns.tsx` |
+
+### PR 2: Run Migration
+
+| Area | Files |
+|------|-------|
+| Types | `web/oss/src/lib/Types.ts` (add workflow types) |
+| Invoke Service | `web/oss/src/services/workflows/invoke.ts` (new) |
+| Debug Section | `web/oss/src/components/.../ConfigureEvaluator/DebugSection.tsx` |
 
 ### Backend Reference (PR #3527)
-- `api/oss/src/routers/evaluators_router.py` - Legacy endpoints (kept for backward compatibility)
+- `api/oss/src/routers/evaluators_router.py` - Legacy endpoints (kept temporarily)
 - `api/oss/src/apis/fastapi/evaluators/router.py` - New `SimpleEvaluators` router
+- `api/oss/src/apis/fastapi/workflows/router.py` - Workflow invoke endpoint
 - `api/oss/src/core/evaluators/dtos.py` - New data transfer objects
+
+## Effort Estimate
+
+| PR | Effort |
+|----|--------|
+| PR 1: CRUD | 4-5 days |
+| PR 2: Run | 3-4 days |
+| **Total** | **7-9 days** |
diff --git a/docs/design/migrate-evaluator-playground/context.md b/docs/design/migrate-evaluator-playground/context.md
index 4f2fdae9be..5fa82e8b21 100644
--- a/docs/design/migrate-evaluator-playground/context.md
+++ b/docs/design/migrate-evaluator-playground/context.md
@@ -24,34 +24,49 @@ The Evaluator Playground frontend currently uses legacy endpoints:
 The backend (PR #3527) has:
 1. Migrated all evaluator configs to the new workflow-based model via DB migrations
 2. Created new `SimpleEvaluators` endpoints at `/preview/simple/evaluators/`
-3. Kept legacy endpoints as thin wrappers that convert new model back to legacy format
+3. Exposed native workflow execution at `/preview/workflows/invoke`
+4. Kept legacy endpoints as thin wrappers (to be deprecated)
 
 **The frontend needs to migrate to use the new endpoints directly.**
 
 ## Goals
 
 1. **Replace legacy evaluator config CRUD** with new `SimpleEvaluator` endpoints
-2. **Update data models** in frontend to match new `SimpleEvaluator` shape
-3. **Maintain backward compatibility** during transition (feature flag or gradual rollout)
-4. **Keep the evaluator run endpoint** (`/evaluators/{key}/run/`) - this remains unchanged
-5. **Preserve UX** - no user-facing changes to the Evaluator Playground functionality
+2. **Replace legacy evaluator run** with native workflow invoke (`/preview/workflows/invoke`)
+3. **Update data models** in frontend to match new `SimpleEvaluator` shape (no adapters)
+4. **Preserve UX** - no user-facing changes to the Evaluator Playground functionality
+5. **Remove all legacy endpoint usage** - clean migration, no dual-path code
 
 ## Non-Goals
 
-1. **Not migrating the evaluator run endpoint** - The `/evaluators/{key}/run/` endpoint is still used and works the same way
-2. **Not changing the Evaluator Playground UI** - Only the data layer changes
-3. **Not migrating evaluation batch runs** - Those use evaluator revision IDs which are handled by the backend migration
-4. **Not introducing new evaluator features** - This is a pure backend migration
+1. **Not changing the Evaluator Playground UI** - Only the data layer changes
+2. **Not migrating evaluation batch runs** - Those already use the new workflow system internally
+3. **Not introducing new evaluator features** - This is a pure endpoint migration
 
 ## Success Criteria
 
-1. Evaluator Playground can create, edit, delete evaluators using new endpoints
-2. All existing evaluator configurations continue to work
-3. No regression in evaluator testing functionality
-4. Clean removal of legacy endpoint usage in frontend
+1. Evaluator Playground can create, edit, delete evaluators using new `SimpleEvaluator` endpoints
+2. Evaluator Playground can run evaluators using native workflow invoke
+3. All existing evaluator configurations continue to work
+4. No regression in evaluator testing functionality
+5. No legacy endpoint calls remain in frontend code
 
 ## Constraints
 
 1. Must not break existing evaluator configurations
-2. Must coordinate with backend team on endpoint availability
-3. Should be deployable incrementally (not big-bang)
+2. Must coordinate with backend team on endpoint availability (PR #3527)
+3. Split into two PRs for reviewability (CRUD first, then Run)
+
+## Migration Approach
+
+**Direct migration (no adapters):**
+
+| PR | Scope | Endpoints |
+|----|-------|-----------|
+| PR 1 | CRUD | `/preview/simple/evaluators/*` |
+| PR 2 | Run | `/preview/workflows/invoke` |
+
+This approach:
+- Avoids tech debt from adapter layers
+- Aligns internal types with backend models
+- Keeps changes reviewable by splitting into two PRs
diff --git a/docs/design/migrate-evaluator-playground/migration-options.md b/docs/design/migrate-evaluator-playground/migration-options.md
index 6ea44db0de..40bf6b4caa 100644
--- a/docs/design/migrate-evaluator-playground/migration-options.md
+++ b/docs/design/migrate-evaluator-playground/migration-options.md
@@ -1,125 +1,106 @@
-# Migration Options (Plan A vs Plan B)
+# Migration Options
 
 ## Goal
 
 Full migration of the Evaluator Playground to the new workflow-based evaluator APIs, including:
-- CRUD on evaluator configs via `/preview/simple/evaluators/*` (or the richer `/preview/evaluators/*` family)
+- CRUD on evaluator configs via `/preview/simple/evaluators/*`
 - Running evaluators via native workflow invocation (`/preview/workflows/invoke`) instead of the legacy `/evaluators/{key}/run`
 
-This doc lists two concrete migration strategies.
-
 ---
 
-## Plan A (Transitional): Keep Internal Shapes Stable
+## Option A (Rejected): Adapter Pattern
 
-This is the earlier approach: keep the UI/state assuming the legacy `EvaluatorConfig` shape and translate at the API boundary.
+Keep the UI/state assuming the legacy `EvaluatorConfig` shape and translate at the API boundary.
 
-### Why it exists
+### Why it was considered
 
-- Minimizes touching UI/atoms/forms
+- Minimizes touching UI/atoms/forms initially
 - Lets you swap endpoints quickly with limited regression surface
 - Good when backend is still stabilizing schemas
 
-### Trade-offs
+### Why it was rejected
 
-- Adds an extra abstraction layer (adapters)
-- Can delay paying down legacy assumptions (`settings_values`, `evaluator_key`, etc.)
+- Adds tech debt (adapter layer becomes permanent)
+- Delays alignment with new architecture
+- Makes future changes harder (two mental models)
 
 ---
 
-## Plan B (Preferred): Direct Migration (No Adapters)
+## Option B (Chosen): Direct Migration
 
-This changes the frontend domain model to match the backend reality:
-- “Evaluator config” becomes `SimpleEvaluator` (workflow artifact w/ latest evaluator revision data attached)
-- Execution uses workflow invocation (`/preview/workflows/invoke`) using evaluator `data.uri`
+Change the frontend domain model to match the backend:
+- "Evaluator config" becomes `SimpleEvaluator`
+- Internal shapes use `data.parameters` instead of `settings_values`
+- Internal shapes derive `evaluator_key` from `data.uri`
 
-### Why it’s better long-term
+### Why it's better
 
-- Eliminates translation debt
-- Aligns with “evaluators are workflows” concept end-to-end
+- No translation debt
+- Aligns with "evaluators are workflows" concept end-to-end
 - Unlocks revision-aware runs and custom evaluator URIs
+- Cleaner codebase long-term
 
-### Initial Scope (not exhaustive)
-
-#### 1) Data model and type changes
-
-- Introduce TS types for `SimpleEvaluator*` and `WorkflowService*` (request/response)
-- Replace usages of `EvaluatorConfig` in the evaluator playground path with `SimpleEvaluator`
-
-Key places:
-- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts`
-- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx`
-- `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts`
-
-#### 2) CRUD endpoint swap (configs)
-
-Replace:
-- `GET/POST/PUT/DELETE /evaluators/configs/*`
-
-With:
-- `POST /preview/simple/evaluators/query`
-- `POST /preview/simple/evaluators/`
-- `PUT /preview/simple/evaluators/{id}`
-- `POST /preview/simple/evaluators/{id}/archive`
-
-Key files:
-- `web/oss/src/services/evaluators/index.ts`
-- `web/oss/src/state/evaluators/atoms.ts`
-
-Notes:
-- `evaluator_key` is now derived from `simpleEvaluator.data.uri` (or carried separately in UI state)
-- Settings are now `simpleEvaluator.data.parameters`
+---
 
-#### 3) Run endpoint swap (native invoke)
+## Execution Strategy
 
-Replace:
-- `POST /evaluators/{evaluator_key}/run`
+To keep changes reviewable while avoiding adapters:
 
-With:
-- `POST /preview/workflows/invoke`
+### PR 1: CRUD Migration
+- Migrate all CRUD operations to `/preview/simple/evaluators/*`
+- Change internal types from `EvaluatorConfig` to `SimpleEvaluator`
+- Update atoms, services, and components
+- Keep legacy run endpoint temporarily
 
-What needs changing in the playground:
-- `DebugSection.tsx` currently uses `createEvaluatorRunExecution(evaluatorKey, {inputs, settings})`
-- New call should construct `WorkflowServiceRequest`:
-  - `interface.uri` (or `configuration`+`interface`) derived from evaluator `data` / built-in key
-  - `data.inputs` (merged testcase + prediction)
-  - `data.outputs` (prediction/output)
-  - `data.parameters` (settings)
+### PR 2: Run Migration
+- Migrate run from `/evaluators/{key}/run` to `/preview/workflows/invoke`
+- Add `WorkflowServiceRequest/Response` types
+- Update `DebugSection.tsx` to use native invoke
 
-Key file:
-- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx`
-- plus a new service client, e.g. `web/oss/src/services/workflows/invoke.ts`
+This sequencing:
+1. Isolates CRUD changes for easier review
+2. Allows CRUD to stabilize before changing run
+3. Avoids adapter layer entirely
+4. Results in full migration with no legacy code
 
-#### 4) Registry/list UI adjustments
+---
 
-The evaluators registry table expects legacy `evaluator_key` and `settings_values`. Under Plan B:
-- The list source becomes `SimpleEvaluator[]`
-- Table columns need to read from `data.uri` and `data.parameters`
+## Files Affected
 
-Key files:
-- `web/oss/src/components/Evaluators/index.tsx`
-- `web/oss/src/components/Evaluators/assets/getColumns.tsx`
-- `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts`
+### PR 1 (CRUD)
 
-#### 5) Permissions and error handling
+| Area | Files |
+|------|-------|
+| Types | `web/oss/src/lib/Types.ts` |
+| Services | `web/oss/src/services/evaluators/index.ts` |
+| State | `web/oss/src/state/evaluators/atoms.ts` |
+| Playground State | `web/oss/src/components/.../ConfigureEvaluator/state/atoms.ts` |
+| Playground UI | `web/oss/src/components/.../ConfigureEvaluator/index.tsx` |
+| Registry | `web/oss/src/components/Evaluators/index.tsx` |
+| Registry Hook | `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` |
+| Columns | `web/oss/src/components/Evaluators/assets/getColumns.tsx` |
 
-Native invoke uses `RUN_WORKFLOWS` permission (backend check). Expect:
-- Different 403 behavior for some users
-- Different error shape: workflow service returns `status.code/message` in response
+### PR 2 (Run)
 
-UI needs:
-- Map workflow error status to `message.error` and output editor
+| Area | Files |
+|------|-------|
+| Types | `web/oss/src/lib/Types.ts` (add workflow types) |
+| Invoke Service | `web/oss/src/services/workflows/invoke.ts` (new) |
+| Debug Section | `web/oss/src/components/.../ConfigureEvaluator/DebugSection.tsx` |
 
 ---
 
-## Practical Recommendation
-
-If the objective is “duplicate all endpoints and fully migrate”, Plan B is the right destination.
-
-To reduce risk while still avoiding adapters, a pragmatic sequencing is:
+## Key Mapping Changes
 
-1) Migrate CRUD to SimpleEvaluator endpoints (Plan B)
-2) Keep legacy run for 1-2 PRs while CRUD stabilizes
-3) Migrate run to `/preview/workflows/invoke` (Plan B completion)
+| Legacy | New |
+|--------|-----|
+| `EvaluatorConfig` | `SimpleEvaluator` |
+| `evaluator_key` | derived from `data.uri` |
+| `settings_values` | `data.parameters` |
+| `GET /evaluators/configs/` | `POST /preview/simple/evaluators/query` |
+| `POST /evaluators/configs/` | `POST /preview/simple/evaluators/` |
+| `PUT /evaluators/configs/{id}/` | `PUT /preview/simple/evaluators/{id}` |
+| `DELETE /evaluators/configs/{id}/` | `POST /preview/simple/evaluators/{id}/archive` |
+| `POST /evaluators/{key}/run/` | `POST /preview/workflows/invoke` |
 
-This keeps changes reviewable without introducing a permanent adapter layer.
+See [plan.md](./plan.md) for detailed implementation steps.
diff --git a/docs/design/migrate-evaluator-playground/plan.md b/docs/design/migrate-evaluator-playground/plan.md
index 35a1f9eb4a..a234ec2111 100644
--- a/docs/design/migrate-evaluator-playground/plan.md
+++ b/docs/design/migrate-evaluator-playground/plan.md
@@ -2,63 +2,67 @@
 
 ## Overview
 
-This plan outlines an incremental migration approach that minimizes risk and allows for gradual rollout. The key principle is **transform at boundaries** - keep internal data shapes stable and only change API interactions.
+Full migration of the Evaluator Playground to the new workflow-based evaluator APIs. This plan follows **Option B (Direct Migration)** from [migration-options.md](./migration-options.md) - no adapters; internal shapes change to match the new `SimpleEvaluator` model.
 
 ## Migration Strategy
 
-Two viable strategies exist:
+**Two PRs, no adapters:**
 
-- Plan A (transitional): adapter pattern, keep internal legacy `EvaluatorConfig` shape
-- Plan B (preferred destination): direct migration, internal shapes become `SimpleEvaluator` + native invoke
+1. **PR 1:** Migrate CRUD to `SimpleEvaluator` endpoints (internal shapes change)
+2. **PR 2:** Migrate run to native workflow invoke (`/preview/workflows/invoke`)
 
-This file documents Plan A as the low-risk execution plan. For the direct plan, see `docs/design/migrate-evaluator-playground/migration-options.md`.
-
-## Plan A: Adapter Pattern
-
-Instead of changing data shapes throughout the codebase, we'll:
-1. Create adapter functions at the API boundary
-2. New endpoints return `SimpleEvaluator`, adapters convert to internal `EvaluatorConfig` shape
-3. Internal components continue working unchanged
-4. Gradually update internals later (optional)
+This keeps changes reviewable while avoiding tech debt from adapter layers.
 
 ```
-┌──────────────┐     ┌──────────────┐     ┌──────────────────┐
-│  New API     │ ──► │   Adapter    │ ──► │  Internal Shape  │
-│  Endpoints   │     │   Layer      │     │  (unchanged)     │
-└──────────────┘     └──────────────┘     └──────────────────┘
+PR 1: CRUD Migration
+┌─────────────────────────────────────────────────────────────────┐
+│  EvaluatorConfig → SimpleEvaluator                              │
+│  /evaluators/configs/* → /preview/simple/evaluators/*           │
+│  settings_values → data.parameters                              │
+│  evaluator_key → data.uri                                       │
+└─────────────────────────────────────────────────────────────────┘
+
+PR 2: Run Migration  
+┌─────────────────────────────────────────────────────────────────┐
+│  /evaluators/{key}/run → /preview/workflows/invoke              │
+│  EvaluatorInputInterface → WorkflowServiceRequest               │
+└─────────────────────────────────────────────────────────────────┘
 ```
 
 ---
 
-## Phase 1: Foundation (Low Risk)
-
-**Goal:** Create adapter layer and new service functions without changing existing code
+## PR 1: CRUD Migration
 
-### Tasks
+**Goal:** Replace legacy evaluator config endpoints with new SimpleEvaluator endpoints. Change internal data model from `EvaluatorConfig` to `SimpleEvaluator`.
 
-#### 1.1 Create Type Definitions
+### Phase 1.1: Type Definitions
 
-**File:** `web/oss/src/lib/Types.ts` or new file `web/oss/src/services/evaluators/types.ts`
+**File:** `web/oss/src/lib/Types.ts` (add to existing types)
 
 ```typescript
-// New API types
-interface SimpleEvaluatorData {
+// ============ SimpleEvaluator Types ============
+
+export interface SimpleEvaluatorData {
     version?: string
-    uri?: string
-    url?: string
+    uri?: string                              // e.g., "agenta:builtin:auto_exact_match:v0"
+    url?: string                              // for webhook evaluators
     headers?: Record<string, string>
-    schemas?: { outputs?: Record<string, any> }
+    schemas?: { 
+        outputs?: Record<string, any>
+        inputs?: Record<string, any>
+        parameters?: Record<string, any>
+    }
     script?: { content: string; runtime: string }
-    parameters?: Record<string, any>
+    parameters?: Record<string, any>          // replaces settings_values
 }
 
-interface SimpleEvaluatorFlags {
+export interface SimpleEvaluatorFlags {
     is_custom?: boolean
     is_evaluator?: boolean
     is_human?: boolean
 }
 
-interface SimpleEvaluator {
+export interface SimpleEvaluator {
     id: string
     slug: string
     name?: string
@@ -71,32 +75,54 @@ interface SimpleEvaluator {
     updated_at: string
 }
 
-interface SimpleEvaluatorResponse {
+export interface SimpleEvaluatorCreate {
+    slug: string
+    name?: string
+    description?: string
+    tags?: string[]
+    flags?: SimpleEvaluatorFlags
+    data?: SimpleEvaluatorData
+}
+
+export interface SimpleEvaluatorEdit {
+    id: string
+    name?: string
+    description?: string
+    tags?: string[]
+    data?: SimpleEvaluatorData
+}
+
+export interface SimpleEvaluatorResponse {
     count: number
     evaluator: SimpleEvaluator | null
 }
 
-interface SimpleEvaluatorsResponse {
+export interface SimpleEvaluatorsResponse {
     count: number
     evaluators: SimpleEvaluator[]
 }
 ```
 
-#### 1.2 Create Adapter Functions
+**Deliverables:**
+- [ ] Add `SimpleEvaluator*` types to Types.ts
+- [ ] Keep `EvaluatorConfig` temporarily for areas not yet migrated
+
+---
+
+### Phase 1.2: Service Layer Changes
+
+**File:** `web/oss/src/services/evaluators/index.ts`
 
-**File:** `web/oss/src/services/evaluators/adapters.ts`
+Replace legacy functions with new implementations:
 
 ```typescript
-import { EvaluatorConfig } from "@/oss/lib/Types"
-import { SimpleEvaluator, SimpleEvaluatorData } from "./types"
-import { getTagColors } from "@/oss/lib/helpers/colors"
-import { stringToNumberInRange } from "@/oss/lib/helpers/utils"
+// ============ Helper Functions ============
 
 /**
  * Extract evaluator_key from URI
  * URI format: "agenta:builtin:{key}:v0"
  */
-export function extractEvaluatorKey(uri: string | undefined): string {
+export function extractEvaluatorKeyFromUri(uri: string | undefined): string {
     if (!uri) return ""
     const parts = uri.split(":")
     if (parts.length >= 3 && parts[0] === "agenta" && parts[1] === "builtin") {
@@ -112,69 +138,23 @@ export function buildEvaluatorUri(evaluatorKey: string): string {
     return `agenta:builtin:${evaluatorKey}:v0`
 }
 
-/**
- * Convert SimpleEvaluator to internal EvaluatorConfig shape
- */
-export function simpleEvaluatorToConfig(
-    simple: SimpleEvaluator,
-    projectId?: string
-): EvaluatorConfig {
-    const tagColors = getTagColors()
-    const evaluatorKey = extractEvaluatorKey(simple.data?.uri)
-    
-    return {
-        id: simple.id,
-        name: simple.name || "",
-        evaluator_key: evaluatorKey,
-        settings_values: simple.data?.parameters || {},
-        created_at: simple.created_at,
-        updated_at: simple.updated_at,
-        // Frontend additions
-        color: tagColors[stringToNumberInRange(evaluatorKey, 0, tagColors.length - 1)],
-        tags: simple.tags,
-    }
-}
-
-/**
- * Convert internal EvaluatorConfig to SimpleEvaluator create payload
- */
-export function configToSimpleEvaluatorCreate(
-    config: Omit<EvaluatorConfig, "id" | "created_at">,
-    outputsSchema?: Record<string, any>
-): SimpleEvaluatorCreate {
-    return {
-        slug: generateSlug(config.name),
-        name: config.name,
-        flags: { is_evaluator: true },
-        data: {
-            uri: buildEvaluatorUri(config.evaluator_key),
-            parameters: config.settings_values,
-            schemas: outputsSchema ? { outputs: outputsSchema } : undefined,
-        },
-    }
-}
-
 /**
  * Generate slug from name
  */
-function generateSlug(name: string): string {
+export function generateSlug(name: string): string {
     return name
         .toLowerCase()
         .replace(/[^a-z0-9]+/g, "-")
         .replace(/^-|-$/g, "")
+        .substring(0, 50)  // limit length
 }
-```
-
-#### 1.3 Create New Service Functions
 
-**File:** `web/oss/src/services/evaluators/index.ts` (add to existing)
+// ============ CRUD Functions ============
 
-```typescript
-// === NEW ENDPOINT FUNCTIONS ===
-
-export const fetchAllEvaluatorConfigsV2 = async (
+export const fetchAllEvaluatorConfigs = async (
+    _appId?: string | null,  // kept for backward compat, ignored
     projectIdOverride?: string | null,
-): Promise<EvaluatorConfig[]> => {
+): Promise<SimpleEvaluator[]> => {
     const {projectId: projectIdFromStore} = getProjectValues()
     const projectId = projectIdOverride ?? projectIdFromStore
 
@@ -182,269 +162,561 @@ export const fetchAllEvaluatorConfigsV2 = async (
 
     const response = await axios.post(
         `${getAgentaApiUrl()}/preview/simple/evaluators/query?project_id=${projectId}`,
-        { flags: { is_evaluator: true } }
+        { evaluator: { flags: { is_evaluator: true } } }
     )
     
-    const evaluators = response.data?.evaluators || []
-    return evaluators.map((e: SimpleEvaluator) => simpleEvaluatorToConfig(e, projectId))
+    return response.data?.evaluators || []
 }
 
-export const createEvaluatorConfigV2 = async (
-    config: CreateEvaluationConfigData,
-): Promise<EvaluatorConfig> => {
+export const createEvaluatorConfig = async (
+    evaluatorKey: string,
+    name: string,
+    settingsValues: Record<string, any>,
+): Promise<SimpleEvaluator> => {
     const {projectId} = getProjectValues()
     
-    const payload = configToSimpleEvaluatorCreate(config)
+    const payload: SimpleEvaluatorCreate = {
+        slug: generateSlug(name),
+        name,
+        flags: { is_evaluator: true },
+        data: {
+            uri: buildEvaluatorUri(evaluatorKey),
+            parameters: settingsValues,
+        },
+    }
     
     const response = await axios.post(
         `${getAgentaApiUrl()}/preview/simple/evaluators/?project_id=${projectId}`,
-        payload,
+        { evaluator: payload },
     )
     
-    const simple = response.data?.evaluator
-    if (!simple) throw new Error("Failed to create evaluator")
+    const result = response.data?.evaluator
+    if (!result) throw new Error("Failed to create evaluator")
     
-    return simpleEvaluatorToConfig(simple, projectId)
+    return result
 }
 
-export const updateEvaluatorConfigV2 = async (
-    configId: string,
-    config: Partial<CreateEvaluationConfigData>,
-): Promise<EvaluatorConfig> => {
+export const updateEvaluatorConfig = async (
+    evaluatorId: string,
+    updates: { name?: string; settingsValues?: Record<string, any> },
+): Promise<SimpleEvaluator> => {
     const {projectId} = getProjectValues()
 
     const payload: SimpleEvaluatorEdit = {
-        id: configId,
-        name: config.name,
-        data: config.settings_values 
-            ? { parameters: config.settings_values }
+        id: evaluatorId,
+        name: updates.name,
+        data: updates.settingsValues 
+            ? { parameters: updates.settingsValues }
             : undefined,
     }
 
     const response = await axios.put(
-        `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}?project_id=${projectId}`,
-        payload,
+        `${getAgentaApiUrl()}/preview/simple/evaluators/${evaluatorId}?project_id=${projectId}`,
+        { evaluator: payload },
     )
     
-    const simple = response.data?.evaluator
-    if (!simple) throw new Error("Failed to update evaluator")
+    const result = response.data?.evaluator
+    if (!result) throw new Error("Failed to update evaluator")
     
-    return simpleEvaluatorToConfig(simple, projectId)
+    return result
 }
 
-export const deleteEvaluatorConfigV2 = async (configId: string): Promise<boolean> => {
+export const deleteEvaluatorConfig = async (evaluatorId: string): Promise<boolean> => {
     const {projectId} = getProjectValues()
 
     await axios.post(
-        `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}/archive?project_id=${projectId}`,
+        `${getAgentaApiUrl()}/preview/simple/evaluators/${evaluatorId}/archive?project_id=${projectId}`,
     )
     
     return true
 }
+
+export const fetchEvaluatorById = async (evaluatorId: string): Promise<SimpleEvaluator | null> => {
+    const {projectId} = getProjectValues()
+
+    const response = await axios.get(
+        `${getAgentaApiUrl()}/preview/simple/evaluators/${evaluatorId}?project_id=${projectId}`,
+    )
+    
+    return response.data?.evaluator || null
+}
 ```
 
 **Deliverables:**
-- [ ] Type definitions for new API shapes
-- [ ] Adapter functions (both directions)
-- [ ] New service functions with V2 suffix
-- [ ] Unit tests for adapters
-
-**Estimated Effort:** 1-2 days
+- [ ] Replace `fetchAllEvaluatorConfigs` implementation
+- [ ] Replace `createEvaluatorConfig` implementation
+- [ ] Replace `updateEvaluatorConfig` implementation
+- [ ] Replace `deleteEvaluatorConfig` implementation
+- [ ] Add helper functions for URI handling
+- [ ] Remove legacy `/evaluators/configs/*` endpoint calls (the legacy run endpoint stays until PR 2)
 
 ---
 
-## Phase 2: Feature Flag Integration (Low Risk)
-
-**Goal:** Add feature flag to toggle between old and new endpoints
+### Phase 1.3: State/Atoms Changes
 
-### Tasks
+**File:** `web/oss/src/state/evaluators/atoms.ts`
 
-#### 2.1 Add Feature Flag
-
-**File:** `web/oss/src/lib/helpers/featureFlags.ts` or environment config
+Update query atoms to return `SimpleEvaluator[]`:
 
 ```typescript
-export const USE_NEW_EVALUATOR_ENDPOINTS = 
-    process.env.NEXT_PUBLIC_USE_NEW_EVALUATOR_ENDPOINTS === "true"
+export const evaluatorConfigsQueryAtomFamily = atomFamily((projectId: string | null) =>
+    atomWithQuery(() => ({
+        queryKey: ["evaluator-configs", projectId],
+        queryFn: () => fetchAllEvaluatorConfigs(null, projectId),
+        enabled: !!projectId,
+    }))
+)
+
+// Derived atom for non-archived evaluators
+export const nonArchivedEvaluatorsAtom = atom((get) => {
+    const projectId = get(projectIdAtom)
+    if (!projectId) return []
+    
+    const query = get(evaluatorConfigsQueryAtomFamily(projectId))
+    const evaluators = query.data ?? []
+    
+    // Filter out archived (deleted_at is set)
+    return evaluators.filter((e) => !e.deleted_at)
+})
 ```
 
-#### 2.2 Create Unified Service Functions
+**File:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts`
 
-**File:** `web/oss/src/services/evaluators/index.ts`
+Update playground atoms to use `SimpleEvaluator`:
 
 ```typescript
-// Unified functions that use feature flag
-export const fetchAllEvaluatorConfigs = async (...args) => {
-    if (USE_NEW_EVALUATOR_ENDPOINTS) {
-        return fetchAllEvaluatorConfigsV2(...args)
-    }
-    return fetchAllEvaluatorConfigsLegacy(...args)
+// Session now stores SimpleEvaluator instead of EvaluatorConfig
+export interface PlaygroundSession {
+    evaluator: Evaluator              // template (unchanged)
+    simpleEvaluator?: SimpleEvaluator // existing config being edited
+    mode: "create" | "edit" | "clone"
 }
 
-export const createEvaluatorConfig = async (...args) => {
-    if (USE_NEW_EVALUATOR_ENDPOINTS) {
-        return createEvaluatorConfigV2(...args)
-    }
-    return createEvaluatorConfigLegacy(...args)
-}
+export const playgroundSessionAtom = atom<PlaygroundSession | null>(null)
 
-// ... same for update and delete
+// Edit values now use SimpleEvaluator shape
+export const playgroundEditValuesAtom = atom<Partial<SimpleEvaluator> | null>(null)
+
+// Derived: get evaluator_key from URI
+export const playgroundEvaluatorKeyAtom = atom((get) => {
+    const session = get(playgroundSessionAtom)
+    if (!session) return null
+    
+    // From template
+    if (session.evaluator?.key) return session.evaluator.key
+    
+    // From existing SimpleEvaluator
+    if (session.simpleEvaluator?.data?.uri) {
+        return extractEvaluatorKeyFromUri(session.simpleEvaluator.data.uri)
+    }
+    
+    return null
+})
 ```
 
 **Deliverables:**
-- [ ] Feature flag configuration
-- [ ] Unified service functions with flag branching
-- [ ] Documentation for enabling flag
-
-**Estimated Effort:** 0.5 days
+- [ ] Update `evaluatorConfigsQueryAtomFamily` return type
+- [ ] Update playground session atoms
+- [ ] Update `playgroundEditValuesAtom` shape
+- [ ] Add derived atoms for backward-compatible access (e.g., `evaluator_key`)
 
 ---
 
-## Phase 3: Integration Testing (Medium Risk)
+### Phase 1.4: Component Changes
 
-**Goal:** Verify new endpoints work correctly with existing UI
+#### ConfigureEvaluator/index.tsx
 
-### Tasks
+Key changes:
+- Form fields read/write to `data.parameters` instead of `settings_values`
+- On commit, build `SimpleEvaluatorCreate` or `SimpleEvaluatorEdit`
+- Load existing config as `SimpleEvaluator`
 
-#### 3.1 Enable Feature Flag in Development
+```typescript
+// Before
+form.setFieldsValue({
+    name: editEvalEditValues.name,
+    settings_values: editEvalEditValues.settings_values,
+})
+
+// After
+form.setFieldsValue({
+    name: simpleEvaluator.name,
+    parameters: simpleEvaluator.data?.parameters,
+})
+```
 
-- Set `NEXT_PUBLIC_USE_NEW_EVALUATOR_ENDPOINTS=true` in dev environment
-- Test all evaluator playground flows
+#### useEvaluatorsRegistryData.ts
 
-#### 3.2 Test Cases
+Update to work with `SimpleEvaluator[]`:
+
+```typescript
+// Derive evaluator_key for display
+const enrichedEvaluators = evaluators.map((e) => ({
+    ...e,
+    evaluator_key: extractEvaluatorKeyFromUri(e.data?.uri),
+    settings_values: e.data?.parameters,  // for backward compat in UI
+}))
+```
+
+#### getColumns.tsx
+
+Update column accessors:
+
+```typescript
+// Before
+dataIndex: "evaluator_key"
+
+// After  
+dataIndex: ["data", "uri"],
+render: (uri) => extractEvaluatorKeyFromUri(uri)
+```
+
+**Deliverables:**
+- [ ] Update ConfigureEvaluator form bindings
+- [ ] Update commit logic to use new service functions
+- [ ] Update useEvaluatorsRegistryData hook
+- [ ] Update table columns in getColumns.tsx
+- [ ] Update any other components that read evaluator configs
+
+---
+
+### Phase 1.5: Testing
+
+**Test Cases:**
 
 1. **List Evaluators**
    - [ ] Registry shows all existing evaluator configs
-   - [ ] Correct names, types, and icons displayed
+   - [ ] Correct names, types, icons displayed
    - [ ] Filtering and search work
+   - [ ] Archived evaluators hidden
 
 2. **Create Evaluator**
-   - [ ] Select template → Configure → Commit
-   - [ ] Settings saved correctly
-   - [ ] Redirects to edit page after create
+   - [ ] Select template → Configure → Commit works
+   - [ ] Settings (parameters) saved correctly
+   - [ ] URI generated correctly from evaluator_key
+   - [ ] Slug generated from name
 
 3. **Edit Evaluator**
-   - [ ] Load existing config
-   - [ ] Form populated with current values
-   - [ ] Update settings
+   - [ ] Load existing config into form
+   - [ ] Form populated with current values from `data.parameters`
+   - [ ] Update name and settings
    - [ ] Changes persisted
 
 4. **Delete Evaluator**
-   - [ ] Delete confirmation works
+   - [ ] Archive endpoint called
    - [ ] Evaluator removed from list
    - [ ] No errors
 
-5. **Test Evaluator**
-   - [ ] Load testcase
-   - [ ] Run variant
-   - [ ] Run evaluator
+5. **Run Evaluator (legacy endpoint - still works)**
+   - [ ] Run evaluator button works
+   - [ ] Uses evaluator_key derived from URI
    - [ ] Results displayed correctly
 
 **Deliverables:**
-- [ ] Test results document
-- [ ] Bug fixes for any issues found
-- [ ] Performance comparison (if applicable)
+- [ ] Manual test all flows
+- [ ] Fix any bugs found
+- [ ] Document any edge cases
 
-**Estimated Effort:** 2-3 days
+---
+
+### PR 1 Summary
+
+| Task | Files | Effort |
+|------|-------|--------|
+| Type definitions | `Types.ts` | 0.5 day |
+| Service layer | `services/evaluators/index.ts` | 1 day |
+| State/atoms | `state/evaluators/atoms.ts`, playground atoms | 1 day |
+| Components | ConfigureEvaluator, Registry, columns | 1-2 days |
+| Testing | Manual testing | 1 day |
+
+**Total PR 1 Effort:** 4-5 days
 
 ---
 
-## Phase 4: Gradual Rollout (Low Risk)
+## PR 2: Run Migration
 
-**Goal:** Enable new endpoints for subset of users
+**Goal:** Replace legacy `/evaluators/{key}/run` with native workflow invoke `/preview/workflows/invoke`.
 
-### Tasks
+**Prerequisite:** PR 1 merged and stable.
 
-#### 4.1 Staged Rollout
+### Phase 2.1: WorkflowService Types
 
-1. **Internal testing:** Enable for team members only
-2. **Beta users:** Enable for opt-in users
-3. **General availability:** Enable for all users
+**File:** `web/oss/src/lib/Types.ts` (add)
+
+```typescript
+// ============ Workflow Service Types ============
+
+export interface WorkflowServiceRequestData {
+    revision?: Record<string, any>
+    parameters?: Record<string, any>    // evaluator settings
+    testcase?: Record<string, any>
+    inputs?: Record<string, any>        // merged testcase data
+    trace?: Record<string, any>
+    outputs?: any                        // prediction/output
+}
 
-#### 4.2 Monitoring
+export interface WorkflowServiceInterface {
+    version?: string
+    uri?: string                         // e.g., "agenta:builtin:auto_exact_match:v0"
+    url?: string
+    headers?: Record<string, string>
+    schemas?: Record<string, any>
+}
 
-- Monitor error rates for evaluator operations
-- Track API response times
-- Watch for unexpected 404/500 errors
+export interface WorkflowServiceConfiguration {
+    script?: Record<string, any>
+    parameters?: Record<string, any>
+}
 
-**Deliverables:**
-- [ ] Rollout schedule
-- [ ] Rollback procedure documented
-- [ ] Monitoring dashboards/alerts
+export interface WorkflowServiceRequest {
+    version?: string
+    flags?: Record<string, any>
+    interface?: WorkflowServiceInterface
+    configuration?: WorkflowServiceConfiguration
+    data?: WorkflowServiceRequestData
+    references?: Record<string, any>
+    links?: Record<string, any>
+}
 
-**Estimated Effort:** 1-2 weeks (elapsed time)
+export interface WorkflowServiceStatus {
+    code?: number
+    message?: string
+    type?: string
+    stacktrace?: string | string[]
+}
+
+export interface WorkflowServiceResponseData {
+    outputs?: any
+}
+
+export interface WorkflowServiceBatchResponse {
+    version?: string
+    trace_id?: string
+    span_id?: string
+    status?: WorkflowServiceStatus
+    data?: WorkflowServiceResponseData
+}
+```
 
 ---
 
-## Phase 5: Cleanup (Low Risk)
+### Phase 2.2: Workflow Invoke Service
 
-**Goal:** Remove legacy code and feature flag
+**File:** `web/oss/src/services/workflows/invoke.ts` (new file)
 
-### Tasks
+```typescript
+import axios from "@/oss/lib/api/assets/axiosConfig"
+import { getAgentaApiUrl } from "@/oss/lib/helpers/utils"
+import { getProjectValues } from "@/oss/contexts/project.context"
+import {
+    WorkflowServiceRequest,
+    WorkflowServiceBatchResponse,
+    SimpleEvaluator,
+} from "@/oss/lib/Types"
+
+export interface InvokeEvaluatorParams {
+    evaluator: SimpleEvaluator
+    inputs: Record<string, any>        // testcase data + any extra inputs
+    outputs: any                        // prediction/output from variant
+    parameters?: Record<string, any>   // override settings (optional)
+}
 
-#### 5.1 Remove Legacy Functions
+/**
+ * Invoke an evaluator using native workflow service
+ */
+export const invokeEvaluator = async (
+    params: InvokeEvaluatorParams
+): Promise<WorkflowServiceBatchResponse> => {
+    const { projectId } = getProjectValues()
+    const { evaluator, inputs, outputs, parameters } = params
+
+    const uri = evaluator.data?.uri
+    if (!uri) {
+        throw new Error("Evaluator has no URI configured")
+    }
 
-- Remove `fetchAllEvaluatorConfigsLegacy`
-- Remove `createEvaluatorConfigLegacy`
-- Remove `updateEvaluatorConfigLegacy`
-- Remove `deleteEvaluatorConfigLegacy`
+    const request: WorkflowServiceRequest = {
+        version: "2025.07.14",
+        interface: {
+            uri,
+        },
+        configuration: {
+            parameters: parameters ?? evaluator.data?.parameters,
+        },
+        data: {
+            inputs,
+            outputs,
+            parameters: parameters ?? evaluator.data?.parameters,
+        },
+    }
 
-#### 5.2 Remove Feature Flag
+    const response = await axios.post<WorkflowServiceBatchResponse>(
+        `${getAgentaApiUrl()}/preview/workflows/invoke?project_id=${projectId}`,
+        request,
+    )
 
-- Remove feature flag checks
-- Clean up V2 suffix from function names
+    return response.data
+}
 
-#### 5.3 Update Documentation
+/**
+ * Map workflow response to evaluator output format
+ */
+export function mapWorkflowResponseToEvaluatorOutput(
+    response: WorkflowServiceBatchResponse
+): { outputs: Record<string, any> } {
+    if (response.status?.code && response.status.code >= 400) {
+        throw new Error(response.status.message || "Evaluator execution failed")
+    }
 
-- Update API documentation
-- Update developer docs
+    return {
+        outputs: response.data?.outputs ?? {},
+    }
+}
+```
+
+---
+
+### Phase 2.3: Update DebugSection
+
+**File:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx`
+
+Replace `createEvaluatorRunExecution` with `invokeEvaluator`:
+
+```typescript
+// Before
+const runResponse = await createEvaluatorRunExecution(
+    selectedEvaluator.key,
+    {
+        inputs: outputs,
+        settings: formValues.settings_values,
+    }
+)
+
+// After
+import { invokeEvaluator, mapWorkflowResponseToEvaluatorOutput } from "@/oss/services/workflows/invoke"
+
+const workflowResponse = await invokeEvaluator({
+    evaluator: simpleEvaluator,  // from playground state
+    inputs: {
+        ...testcaseData,
+        prediction: variantOutput,
+    },
+    outputs: variantOutput,
+    parameters: formValues.parameters,  // current form settings
+})
+
+const runResponse = mapWorkflowResponseToEvaluatorOutput(workflowResponse)
+```
+
+**Error Handling:**
+
+```typescript
+try {
+    const workflowResponse = await invokeEvaluator(...)
+    
+    // Check for workflow-level errors
+    if (workflowResponse.status?.code && workflowResponse.status.code >= 400) {
+        message.error(workflowResponse.status.message || "Evaluator failed")
+        return
+    }
+    
+    const result = mapWorkflowResponseToEvaluatorOutput(workflowResponse)
+    setEvaluatorResult(result.outputs)
+    
+} catch (error) {
+    message.error(getErrorMessage(error))
+}
+```
+
+---
+
+### Phase 2.4: Update Evaluations Service (if needed)
+
+If other parts of the app use `createEvaluatorRunExecution`, update them too:
+
+**File:** `web/oss/src/services/evaluations/api_ee/index.ts`
+
+- Keep `createEvaluatorRunExecution` for now (batch evaluations may still use it via backend)
+- Or deprecate and point to new invoke
+
+---
+
+### Phase 2.5: Testing
+
+**Test Cases:**
+
+1. **Run Evaluator in Playground**
+   - [ ] Click "Run Evaluator" with testcase loaded
+   - [ ] Native invoke endpoint called
+   - [ ] Results displayed correctly
+   - [ ] Errors handled gracefully
+
+2. **Different Evaluator Types**
+   - [ ] Test exact_match evaluator
+   - [ ] Test regex evaluator
+   - [ ] Test AI critique evaluator (LLM-based)
+   - [ ] Test custom code evaluator
+
+3. **Error Scenarios**
+   - [ ] Invalid evaluator (no URI)
+   - [ ] Missing inputs
+   - [ ] Evaluator execution error
+   - [ ] Network error
+
+4. **Permissions**
+   - [ ] User with RUN_WORKFLOWS permission can run
+   - [ ] User without permission gets appropriate error
 
 **Deliverables:**
-- [ ] Legacy code removed
-- [ ] Feature flag removed
-- [ ] Documentation updated
-- [ ] PR for cleanup
+- [ ] Manual test all evaluator types
+- [ ] Fix any bugs found
+- [ ] Verify error messages are user-friendly
 
-**Estimated Effort:** 1 day
+---
+
+### PR 2 Summary
+
+| Task | Files | Effort |
+|------|-------|--------|
+| Workflow types | `Types.ts` | 0.5 day |
+| Invoke service | `services/workflows/invoke.ts` | 0.5 day |
+| DebugSection update | `DebugSection.tsx` | 1 day |
+| Error handling | Various | 0.5 day |
+| Testing | Manual testing | 1 day |
+
+**Total PR 2 Effort:** 3-4 days
 
 ---
 
 ## Timeline Summary
 
-| Phase | Duration | Risk | Dependencies |
-|-------|----------|------|--------------|
-| Phase 1: Foundation | 1-2 days | Low | None |
-| Phase 2: Feature Flag | 0.5 days | Low | Phase 1 |
-| Phase 3: Integration Testing | 2-3 days | Medium | Phase 2, Backend PR merged |
-| Phase 4: Gradual Rollout | 1-2 weeks | Low | Phase 3 |
-| Phase 5: Cleanup | 1 day | Low | Phase 4 complete |
+| PR | Tasks | Effort | Dependencies |
+|----|-------|--------|--------------|
+| PR 1: CRUD Migration | Types, services, atoms, components | 4-5 days | Backend PR #3527 merged |
+| PR 2: Run Migration | Workflow types, invoke service, DebugSection | 3-4 days | PR 1 merged and stable |
 
-**Total Implementation Time:** ~5-7 days
-**Total Rollout Time:** ~2-3 weeks
+**Total Implementation:** 7-9 days
 
 ---
 
 ## Rollback Plan
 
-If issues are discovered after deployment:
+### PR 1 Rollback
+- Revert PR 1 commit
+- Legacy endpoints remain available on the backend for a transition period
 
-1. **Immediate:** Set feature flag to `false`
-2. **Short-term:** Deploy hotfix to disable new endpoints
-3. **Investigation:** Analyze issues with new endpoints
-4. **Resolution:** Fix and re-test before re-enabling
+### PR 2 Rollback
+- Revert PR 2 commit
+- Fall back to legacy `/evaluators/{key}/run` (still supported)
 
 ---
 
 ## Open Questions
 
-1. **Output Schema Generation:** Should the frontend generate output schemas when creating evaluators, or should the backend handle this?
-   - Current PR shows backend generates schemas during migration
-   - Frontend may need to include schema for new configs
+1. **Slug uniqueness:** Does the backend enforce unique slugs? If a slug collides, does the backend auto-suffix it?
+
+2. **Output schemas:** Should frontend pass `data.schemas.outputs` when creating? Or does backend derive from evaluator type?
 
-2. **Slug Generation:** Should slugs be generated client-side or server-side?
-   - Server-side is safer (uniqueness checks)
-   - Client-side is faster (no round-trip)
+3. **Permission model:** Is `RUN_WORKFLOWS` the right permission for evaluator playground? Or should there be `RUN_EVALUATORS`?
 
-3. **Error Handling:** How should the frontend handle validation errors from new endpoints?
-   - New endpoints may return different error shapes
-   - Need to map to user-friendly messages
+4. **Trace linking:** Should the playground display trace_id from workflow response for debugging?
diff --git a/docs/design/migrate-evaluator-playground/risk-analysis.md b/docs/design/migrate-evaluator-playground/risk-analysis.md
index 0bd037f0a0..3c522d441a 100644
--- a/docs/design/migrate-evaluator-playground/risk-analysis.md
+++ b/docs/design/migrate-evaluator-playground/risk-analysis.md
@@ -25,9 +25,10 @@ interface EvaluatorConfig {
 - `playgroundEditValuesAtom` is read throughout ConfigureEvaluator and DebugSection
 - Form initialization relies on `settings_values` property name
 
-**Mitigation:**
-- Create adapter functions to convert between `SimpleEvaluator` and internal state
-- Or update atoms to use `SimpleEvaluator` shape and update all consumers
+**Mitigation (PR 1):**
+- Update atoms to use `SimpleEvaluator` shape directly
+- Add derived atoms for backward-compatible access (e.g., `evaluator_key` from URI)
+- Update all atom consumers in ConfigureEvaluator and DebugSection
 
 ---
 
@@ -53,9 +54,10 @@ if (editMode && editEvalEditValues) {
 - Changing to `data.parameters` would break form binding
 - DynamicFormField components use `["settings_values", field.key]` name paths
 
-**Mitigation:**
-- Keep internal form structure as `settings_values` 
-- Transform on API boundary (adapter pattern)
+**Mitigation (PR 1):**
+- Update form field names from `settings_values` to `parameters`
+- Update DynamicFormField name paths
+- Update form.getFieldsValue() to extract `parameters`
 
 ---
 
@@ -81,10 +83,10 @@ return axios.put(`/evaluators/configs/${configId}?project_id=${projectId}`, conf
 - Need to update URLs and payload transformation
 - Response handling needs to unwrap `{ evaluator: ... }` wrapper
 
-**Mitigation:**
-- Create new service functions for new endpoints
-- Keep old functions temporarily for gradual migration
-- Add response/request transformers
+**Mitigation (PR 1):**
+- Replace all service functions with new implementations
+- New functions build `SimpleEvaluator` payloads directly
+- Handle response wrapper in service layer
 
 ---
 
@@ -106,9 +108,10 @@ const {evaluatorConfigs} = useFetchEvaluatorsData()
 - Tag cells, type pills depend on config shape
 - Filtering/search operates on legacy property names
 
-**Mitigation:**
-- Update hook to handle new `SimpleEvaluator` shape
-- Transform data at fetch boundary, keep internal shape consistent
+**Mitigation (PR 1):**
+- Update hook to work with `SimpleEvaluator[]`
+- Derive `evaluator_key` from `data.uri` for display
+- Update column accessors in getColumns.tsx
 
 ---
 
@@ -116,12 +119,11 @@ const {evaluatorConfigs} = useFetchEvaluatorsData()
 
 **Location:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx`
 
-**Risk Level:** LOW
+**Risk Level:** MEDIUM (PR 2)
 
-The evaluator run uses `evaluator_key` directly:
+The evaluator run uses the legacy endpoint:
 
 ```typescript
-// Line 456
 const runResponse = await createEvaluatorRunExecution(
     selectedEvaluator.key,  // evaluator_key
     { inputs: outputs, settings: ... }
@@ -129,13 +131,14 @@ const runResponse = await createEvaluatorRunExecution(
 ```
 
 **Impact:**
-- This endpoint (`/evaluators/{key}/run/`) remains unchanged
-- Uses `selectedEvaluator.key` from template, not config
-- No direct coupling to `EvaluatorConfig` shape
+- Must migrate to `/preview/workflows/invoke`
+- Need to construct `WorkflowServiceRequest`
+- Different error handling (workflow status vs HTTP errors)
 
-**Mitigation:**
-- No changes needed for run functionality
-- Keep using evaluator templates for the `key` value
+**Mitigation (PR 2):**
+- Create new `invokeEvaluator()` service function
+- Build `WorkflowServiceRequest` with URI from `SimpleEvaluator.data.uri`
+- Map workflow response/errors to UI
 
 ---
 
@@ -160,10 +163,10 @@ const evaluatorConfigsQueryAtomFamily = atomFamily((projectId) =>
 - Multiple components may depend on these atoms
 - Changing shape could cascade through application
 
-**Mitigation:**
-- Update query functions to use new endpoints
-- Transform response at query boundary to maintain internal shape
-- Or update all consumers to handle new shape
+**Mitigation (PR 1):**
+- Update service function to return `SimpleEvaluator[]`
+- Update all consumers to handle new shape
+- Change the return type in one place (the service); it ripples through the atoms automatically
 
 ---
 
@@ -175,7 +178,7 @@ const evaluatorConfigsQueryAtomFamily = atomFamily((projectId) =>
 
 The frontend distinguishes between:
 - **Evaluator templates** (`Evaluator`): Built-in evaluator definitions with `settings_template`
-- **Evaluator configs** (`EvaluatorConfig`): User-created configurations with `settings_values`
+- **Evaluator configs** (`SimpleEvaluator`): User-created configurations with `data.parameters`
 
 **Impact:**
 - This distinction is maintained in the new system
@@ -184,26 +187,27 @@ The frontend distinguishes between:
 
 **Mitigation:**
 - No conceptual change needed
+- Templates API unchanged
 - Just update config handling
 
 ---
 
 ## Risk Summary Table
 
-| Component | Risk Level | Complexity | Priority |
-|-----------|-----------|------------|----------|
-| Service Layer | LOW-MEDIUM | LOW | HIGH (change first) |
-| State Atoms | MEDIUM | MEDIUM | HIGH |
-| ConfigureEvaluator Form | MEDIUM | MEDIUM | MEDIUM |
-| Evaluators Registry | MEDIUM | MEDIUM | MEDIUM |
-| Debug Section | LOW | LOW | LOW |
-| Global Query Atoms | MEDIUM | LOW | MEDIUM |
+| Component | Risk Level | PR | Priority |
+|-----------|-----------|-----|----------|
+| Service Layer | LOW-MEDIUM | PR 1 | HIGH (change first) |
+| State Atoms | MEDIUM | PR 1 | HIGH |
+| ConfigureEvaluator Form | MEDIUM | PR 1 | MEDIUM |
+| Evaluators Registry | MEDIUM | PR 1 | MEDIUM |
+| Global Query Atoms | MEDIUM | PR 1 | MEDIUM |
+| Debug Section (Run) | MEDIUM | PR 2 | MEDIUM |
 
 ## Concrete Breakage Scenarios
 
 ### Scenario 1: Form Submission Fails
 
-**Trigger:** Change `settings_values` to `data.parameters` without updating form
+**Trigger:** Form still uses `settings_values` but service expects `parameters`
 
 **Symptoms:**
 - Form submits but settings are lost
@@ -211,14 +215,15 @@ The frontend distinguishes between:
 - Evaluator created but doesn't work
 
 **Prevention:**
-- Transform at API boundary, not in form
+- Update form field names to `parameters`
 - Test form submission with real backend
+- Verify payload in network tab
 
 ---
 
 ### Scenario 2: Evaluator List Empty
 
-**Trigger:** Query endpoint returns new shape, UI expects old
+**Trigger:** Query endpoint returns `SimpleEvaluator[]`, UI expects `EvaluatorConfig[]`
 
 **Symptoms:**
 - Evaluators registry shows empty list
@@ -226,15 +231,15 @@ The frontend distinguishes between:
 - Console shows undefined property access
 
 **Prevention:**
-- Update data transformation in hook
-- Add null checks and fallbacks
+- Update all components to use `SimpleEvaluator` shape
+- Add null checks for `data?.uri`, `data?.parameters`
 - Log transformation errors
 
 ---
 
 ### Scenario 3: Edit Mode Fails to Load
 
-**Trigger:** `playgroundEditValuesAtom` receives `SimpleEvaluator`, expects `EvaluatorConfig`
+**Trigger:** Component expects `settings_values`, receives `data.parameters`
 
 **Symptoms:**
 - Navigate to edit page, form is empty
@@ -242,7 +247,7 @@ The frontend distinguishes between:
 - Save overwrites with empty config
 
 **Prevention:**
-- Transform at atom level
+- Update form initialization to read from `data.parameters`
 - Test edit flow with existing configs
 
 ---
@@ -262,26 +267,54 @@ The frontend distinguishes between:
 
 ---
 
+### Scenario 5: Evaluator Run Fails (PR 2)
+
+**Trigger:** Workflow invoke returns different response shape
+
+**Symptoms:**
+- Run button shows error
+- Results not displayed
+- Console shows parsing errors
+
+**Prevention:**
+- Map `WorkflowServiceBatchResponse` to expected output format
+- Handle `status.code` errors from workflow response
+- Test with all evaluator types
+
+---
+
 ## Recommended Testing Strategy
 
-### Unit Tests
-- [ ] Service layer transformers (old shape ↔ new shape)
+### PR 1 Testing
+
+**Unit Tests:**
 - [ ] URI parsing (`agenta:builtin:key:v0` → `key`)
 - [ ] Slug generation from name
+- [ ] Service function request/response handling
 
-### Integration Tests
+**Integration Tests:**
 - [ ] Create evaluator config flow
 - [ ] Edit evaluator config flow  
 - [ ] Delete (archive) evaluator config flow
 - [ ] List/query evaluator configs flow
 
-### E2E Tests
+**E2E Tests:**
 - [ ] Full playground flow: select template → configure → test → commit
 - [ ] Edit existing evaluator configuration
 - [ ] Clone evaluator configuration
 - [ ] Delete evaluator configuration
 
-### Regression Tests
-- [ ] Evaluator run still works
-- [ ] Batch evaluations still work (use config IDs)
-- [ ] Existing configs load correctly after migration
+### PR 2 Testing
+
+**Unit Tests:**
+- [ ] `WorkflowServiceRequest` construction
+- [ ] Response mapping to evaluator output format
+- [ ] Error status handling
+
+**Integration Tests:**
+- [ ] Run evaluator with different types (exact_match, regex, AI critique)
+- [ ] Error scenarios (invalid inputs, missing outputs)
+
+**Regression Tests:**
+- [ ] Existing configs load correctly
+- [ ] Batch evaluations still work (they use backend workflow invoke)
diff --git a/docs/design/migrate-evaluator-playground/status.md b/docs/design/migrate-evaluator-playground/status.md
index e0f32606eb..b566579b5d 100644
--- a/docs/design/migrate-evaluator-playground/status.md
+++ b/docs/design/migrate-evaluator-playground/status.md
@@ -1,11 +1,22 @@
 # Status: Evaluator Playground Migration
 
-## Current Phase: Research Complete
+## Current Phase: Planning Complete
 
 **Last Updated:** 2026-01-27
 
 ---
 
+## Chosen Approach
+
+**Direct Migration (No Adapters)** - Split into two PRs:
+
+1. **PR 1:** CRUD migration to `SimpleEvaluator` endpoints
+2. **PR 2:** Run migration to native workflow invoke
+
+See [plan.md](./plan.md) for detailed implementation steps.
+
+---
+
 ## Progress Summary
 
 ### Completed
@@ -37,41 +48,45 @@
   - Service layer coupling (LOW-MEDIUM risk)
   - Created risk mitigation strategies
 
-- [x] Propose migration plan
-  - Adapter pattern approach
-  - Feature flag integration
-  - Phased rollout strategy
+- [x] Finalize migration plan
+  - Chose direct migration (no adapters)
+  - Split into PR 1 (CRUD) and PR 2 (Run)
+  - Documented all file changes needed
 
-### In Progress
+### Next Steps
 
-- [ ] Phase 1: Foundation - Not started
+- [ ] Wait for PR #3527 to be merged
+- [ ] Start PR 1: CRUD migration
+- [ ] After PR 1 stable, start PR 2: Run migration
 
-### Blocked
+---
 
-- [ ] Phase 3: Integration Testing - Blocked on PR #3527 merge
+## Key Decisions
+
+| Decision | Rationale | Date |
+|----------|-----------|------|
+| Direct migration (no adapters) | Avoids tech debt, aligns with new architecture | 2026-01-27 |
+| Two-PR approach | Keeps changes reviewable, allows CRUD to stabilize first | 2026-01-27 |
+| Internal shapes become `SimpleEvaluator` | Matches backend model, no translation layer | 2026-01-27 |
 
 ---
 
 ## Key Findings
 
-### 1. The `/evaluators/{key}/run/` endpoint works but is now a wrapper
+### 1. The `/evaluators/{key}/run/` endpoint is a thin wrapper
 
-**Important Discovery:** PR #3527 refactored the legacy run endpoint to use the native handler registry internally:
+PR #3527 refactored the legacy run endpoint to use the native handler registry internally:
 - It builds a URI from the evaluator_key: `agenta:builtin:{key}:v0`
 - Uses `retrieve_handler(uri)` to get the actual handler function
 - Directly invokes the handler
 
-**Implication:** The external interface is unchanged, but internally it uses the new architecture.
-
 ### 2. Native workflow invoke path exists
 
 There's a fully native way to run evaluators:
 - Endpoint: `POST /preview/workflows/invoke`
-- Uses `WorkflowServiceRequest` with URI in revision data
+- Uses `WorkflowServiceRequest` with the URI in `interface.uri`
 - Same mechanism used by batch evaluations
 
-**Recommendation:** Keep using legacy endpoint for now (simpler), consider native invoke for future custom evaluator support.
-
 ### 3. URI-based handler registry
 
 The SDK maintains a `HANDLER_REGISTRY` that maps URIs to handler functions:
@@ -79,48 +94,34 @@ The SDK maintains a `HANDLER_REGISTRY` that maps URIs to handler functions:
 - Supports custom evaluators: `user:custom:my_eval:latest`
 - Enables version management of evaluator implementations
 
-### 4. Adapter pattern minimizes risk
-
-By transforming data at the API boundary, we can:
-- Keep internal data shapes unchanged
-- Minimize code changes
-- Enable easy rollback via feature flag
-
-### 5. Output schema handling
+### 4. Key mapping changes
 
-The new `SimpleEvaluator` model includes explicit output schemas. The backend migration generates these from evaluator settings. For new configs:
-- Built-in evaluators: Schema can be derived from evaluator type
-- Custom evaluators: Schema should be provided by user
+| Legacy | New |
+|--------|-----|
+| `evaluator_key` | derived from `data.uri` |
+| `settings_values` | `data.parameters` |
+| `EvaluatorConfig` | `SimpleEvaluator` |
 
 ---
 
-## Decisions Made
-
-| Decision | Rationale | Date |
-|----------|-----------|------|
-| Use adapter pattern | Minimizes changes to internal code, enables gradual migration | 2026-01-27 |
-| Feature flag approach | Allows gradual rollout and easy rollback | 2026-01-27 |
-| Keep form structure as `settings_values` | Avoid cascading changes to form components | 2026-01-27 |
+## Open Questions
 
----
+1. **Slug uniqueness:** Does the backend enforce unique slugs? If a slug collides, does the backend auto-suffix it?
 
-## Open Questions
+2. **Output schemas:** Should frontend pass `data.schemas.outputs` when creating? Or does backend derive from evaluator type?
 
-1. **Run migration target:** For full migration, do we want the playground to invoke by:
-   - built-in key -> URI (`agenta:builtin:{key}:v0`), or
-   - evaluator revision URI stored on `SimpleEvaluator.data.uri` (preferred), or
-   - a specific evaluator revision id (even more explicit)?
-2. **Output Schema:** Confirm whether frontend must provide `data.schemas.outputs` on create/edit, or backend will derive defaults.
-3. **Slug Generation:** Client-side or server-side?
+3. **Permission model:** Is `RUN_WORKFLOWS` the right permission for evaluator playground? Or should there be `RUN_EVALUATORS`?
 
 ---
 
-## Next Steps
+## Effort Estimates
+
+| PR | Effort | Dependencies |
+|----|--------|--------------|
+| PR 1: CRUD Migration | 4-5 days | Backend PR #3527 merged |
+| PR 2: Run Migration | 3-4 days | PR 1 merged and stable |
 
-1. Wait for PR #3527 to be merged
-2. Start Phase 1: Create type definitions and adapters
-3. Add feature flag infrastructure
-4. Test with new endpoints
+**Total:** 7-9 days implementation
 
 ---
 
@@ -130,5 +131,7 @@ The new `SimpleEvaluator` model includes explicit output schemas. The backend mi
 - [context.md](./context.md) - Background and goals
 - [current-system.md](./current-system.md) - Current implementation details
 - [new-endpoints.md](./new-endpoints.md) - New endpoint documentation
+- [research.md](./research.md) - Handler registry and execution research
+- [migration-options.md](./migration-options.md) - Why we chose direct migration
 - [risk-analysis.md](./risk-analysis.md) - Coupling and risk analysis
-- [plan.md](./plan.md) - Migration execution plan
+- [plan.md](./plan.md) - Detailed implementation plan

From e3e633d75ad7bec6f8f1fb9a3833b0ffa8d4dba3 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 28 Jan 2026 12:51:51 +0100
Subject: [PATCH 3/4] feat(frontend): migrate evaluator configs CRUD

---
 .../new-endpoints.md                          |   8 +-
 .../migrate-evaluator-playground/plan.md      |  38 ++++--
 .../migrate-evaluator-playground/status.md    |   7 +-
 .../src/components/Evaluators/assets/types.ts |   4 +-
 .../src/components/Evaluators/assets/utils.ts |  38 ++++--
 .../components/ConfigureEvaluator/index.tsx   |   3 +-
 .../hooks/useEvaluatorsRegistryData.ts        |   4 +-
 .../Components/NewEvaluationModalInner.tsx    |   3 +-
 .../SelectEvaluatorSection.tsx                |  52 ++++----
 .../pages/evaluations/NewEvaluation/types.ts  |   6 +-
 .../ConfigureEvaluator/AdvancedSettings.tsx   |   2 +-
 .../ConfigureEvaluator/DebugSection.tsx       |   6 +-
 .../ConfigureEvaluator/DynamicFormField.tsx   |   2 +-
 .../ConfigureEvaluator/FieldsTagsEditor.tsx   |   2 +-
 .../ConfigureEvaluator/index.tsx              | 119 ++++++++---------
 .../ConfigureEvaluator/state/atoms.ts         |  12 +-
 .../Evaluators/DeleteModal.tsx                |   4 +-
 .../Evaluators/EvaluatorCard.tsx              |  20 +--
 .../Evaluators/EvaluatorList.tsx              |  23 ++--
 .../EvaluatorsModal/Evaluators/index.tsx      |  18 ++-
 .../EvaluatorsModal/EvaluatorsModal.tsx       |   4 +-
 .../OnlineEvaluationDrawer.tsx                |   2 +-
 .../hooks/useEvaluatorDetails.ts              |  36 +++++-
 .../hooks/useEvaluatorSelection.tsx           |  31 +++--
 .../hooks/useEvaluatorTypeFromConfigs.ts      |   7 +-
 .../utils/evaluatorDetails.ts                 |  10 +-
 web/oss/src/lib/Types.ts                      |  70 ++++++++++
 web/oss/src/lib/atoms/evaluation.ts           |   4 +-
 web/oss/src/lib/evaluators/utils.ts           |  80 ++++++++++++
 .../lib/hooks/useEvaluatorConfigs/index.ts    |   4 +-
 web/oss/src/services/evaluations/api/index.ts |   2 +-
 web/oss/src/services/evaluators/index.ts      | 120 ++++++++++++++----
 web/oss/src/state/evaluators/atoms.ts         |  24 ++--
 33 files changed, 530 insertions(+), 235 deletions(-)
 create mode 100644 web/oss/src/lib/evaluators/utils.ts

diff --git a/docs/design/migrate-evaluator-playground/new-endpoints.md b/docs/design/migrate-evaluator-playground/new-endpoints.md
index 05231c4813..97a20f01b2 100644
--- a/docs/design/migrate-evaluator-playground/new-endpoints.md
+++ b/docs/design/migrate-evaluator-playground/new-endpoints.md
@@ -259,6 +259,8 @@ Response: SimpleEvaluatorsResponse
 }
 ```
 
+**Note:** For the Evaluator Registry (automatic configs), pass `evaluator.flags.is_human = false` and `include_archived = false` in the query body so that human and archived evaluators are excluded.
+
 ### Create Evaluator Config
 
 **Old:**
@@ -284,7 +286,7 @@ Request: SimpleEvaluatorCreateRequest
     evaluator: {
         slug: string       # Generated from name
         name: string
-        flags: { is_evaluator: true }
+        flags: { is_evaluator: true, is_human: false }
         data: {
             uri: "agenta:builtin:{evaluator_key}:v0"
             parameters: object  # settings_values
@@ -300,6 +302,8 @@ Response: SimpleEvaluatorResponse
 }
 ```
 
+**Note:** Workflow slugs are unique per project. The frontend appends a short random suffix when generating a slug so that evaluators with the same name do not collide.
+
 ### Update Evaluator Config
 
 **Old:**
@@ -333,6 +337,8 @@ Request: SimpleEvaluatorEditRequest
 Response: SimpleEvaluatorResponse
 ```
 
+**Note:** `SimpleEvaluatorEdit.data` is treated as the full revision payload. When updating, send the existing `data.uri` (and any `data.schemas`) together with `data.parameters`; otherwise the URI and schemas are cleared.
+
 ### Delete Evaluator Config
 
 **Old:**
diff --git a/docs/design/migrate-evaluator-playground/plan.md b/docs/design/migrate-evaluator-playground/plan.md
index a234ec2111..8a384658f9 100644
--- a/docs/design/migrate-evaluator-playground/plan.md
+++ b/docs/design/migrate-evaluator-playground/plan.md
@@ -139,14 +139,17 @@ export function buildEvaluatorUri(evaluatorKey: string): string {
 }
 
 /**
- * Generate slug from name
+ * Generate slug from name (append suffix to avoid collisions)
  */
 export function generateSlug(name: string): string {
-    return name
+    const base = name
         .toLowerCase()
         .replace(/[^a-z0-9]+/g, "-")
         .replace(/^-|-$/g, "")
-        .substring(0, 50)  // limit length
+
+    const suffix = Math.random().toString(36).slice(2, 8)
+    const maxBaseLength = Math.max(1, 50 - suffix.length - 1)
+    return `${base.slice(0, maxBaseLength)}-${suffix}`
 }
 
 // ============ CRUD Functions ============
@@ -162,7 +165,10 @@ export const fetchAllEvaluatorConfigs = async (
 
     const response = await axios.post(
         `${getAgentaApiUrl()}/preview/simple/evaluators/query?project_id=${projectId}`,
-        { evaluator: { flags: { is_evaluator: true } } }
+        {
+            evaluator: { flags: { is_evaluator: true, is_human: false } },
+            include_archived: false,
+        }
     )
     
     return response.data?.evaluators || []
@@ -178,7 +184,7 @@ export const createEvaluatorConfig = async (
     const payload: SimpleEvaluatorCreate = {
         slug: generateSlug(name),
         name,
-        flags: { is_evaluator: true },
+        flags: { is_evaluator: true, is_human: false },
         data: {
             uri: buildEvaluatorUri(evaluatorKey),
             parameters: settingsValues,
@@ -199,15 +205,21 @@ export const createEvaluatorConfig = async (
 export const updateEvaluatorConfig = async (
     evaluatorId: string,
     updates: { name?: string; settingsValues?: Record<string, any> },
+    existing?: SimpleEvaluator,
 ): Promise<SimpleEvaluator> => {
     const {projectId} = getProjectValues()
 
+    // IMPORTANT: include existing data (uri/schemas) when editing
     const payload: SimpleEvaluatorEdit = {
         id: evaluatorId,
-        name: updates.name,
-        data: updates.settingsValues 
-            ? { parameters: updates.settingsValues }
-            : undefined,
+        name: updates.name ?? existing?.name,
+        data: {
+            ...(existing?.data ?? {}),
+            ...(updates.settingsValues ? {parameters: updates.settingsValues} : {}),
+        },
+        tags: existing?.tags,
+        meta: existing?.meta,
+        flags: existing?.flags,
     }
 
     const response = await axios.put(
@@ -338,7 +350,7 @@ form.setFieldsValue({
     settings_values: editEvalEditValues.settings_values,
 })
 
-// After
+// After (use parameters field to match SimpleEvaluator)
 form.setFieldsValue({
     name: simpleEvaluator.name,
     parameters: simpleEvaluator.data?.parameters,
@@ -354,7 +366,7 @@ Update to work with `SimpleEvaluator[]`:
 const enrichedEvaluators = evaluators.map((e) => ({
     ...e,
     evaluator_key: extractEvaluatorKeyFromUri(e.data?.uri),
-    settings_values: e.data?.parameters,  // for backward compat in UI
+    parameters: e.data?.parameters,
 }))
 ```
 
@@ -588,7 +600,7 @@ const runResponse = await createEvaluatorRunExecution(
     selectedEvaluator.key,
     {
         inputs: outputs,
-        settings: formValues.settings_values,
+        settings: formValues.parameters,
     }
 )
 
@@ -713,7 +725,7 @@ If other parts of the app use `createEvaluatorRunExecution`, update them too:
 
 ## Open Questions
 
-1. **Slug uniqueness:** Does backend enforce unique slugs? If collision, does it auto-suffix?
+1. **Slug uniqueness (resolved):** The backend enforces unique slugs per project, so the client appends a short random suffix when generating a slug to avoid collisions.
 
 2. **Output schemas:** Should frontend pass `data.schemas.outputs` when creating? Or does backend derive from evaluator type?
 
diff --git a/docs/design/migrate-evaluator-playground/status.md b/docs/design/migrate-evaluator-playground/status.md
index b566579b5d..dbce737e8f 100644
--- a/docs/design/migrate-evaluator-playground/status.md
+++ b/docs/design/migrate-evaluator-playground/status.md
@@ -1,6 +1,6 @@
 # Status: Evaluator Playground Migration
 
-## Current Phase: Planning Complete
+## Current Phase: PR 1 (CRUD) In Progress
 
 **Last Updated:** 2026-01-27
 
@@ -55,8 +55,7 @@ See [plan.md](./plan.md) for detailed implementation steps.
 
 ### Next Steps
 
-- [ ] Wait for PR #3527 to be merged
-- [ ] Start PR 1: CRUD migration
+- [ ] Complete PR 1: CRUD migration (stacked on PR #3527)
 - [ ] After PR 1 stable, start PR 2: Run migration
 
 ---
@@ -106,7 +105,7 @@ The SDK maintains a `HANDLER_REGISTRY` that maps URIs to handler functions:
 
 ## Open Questions
 
-1. **Slug uniqueness:** Does backend enforce unique slugs? If collision, does it auto-suffix?
+1. **Slug uniqueness (resolved):** The backend enforces unique slugs per project, so the client appends a short random suffix when generating a slug to avoid collisions.
 
 2. **Output schemas:** Should frontend pass `data.schemas.outputs` when creating? Or does backend derive from evaluator type?
 
diff --git a/web/oss/src/components/Evaluators/assets/types.ts b/web/oss/src/components/Evaluators/assets/types.ts
index f928cdc801..ccfdfaaa06 100644
--- a/web/oss/src/components/Evaluators/assets/types.ts
+++ b/web/oss/src/components/Evaluators/assets/types.ts
@@ -1,5 +1,5 @@
 import {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types"
-import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types"
+import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types"
 
 export type EvaluatorCategory = "automatic" | "human"
 
@@ -15,7 +15,7 @@ export type EvaluatorPreview = EvaluatorPreviewDto & {
     metrics?: Record<string, unknown>
 }
 
-export type EvaluatorConfigRow = EvaluatorConfig & {
+export type EvaluatorConfigRow = SimpleEvaluator & {
     evaluator?: Evaluator | null
     kind?: "config"
 }
diff --git a/web/oss/src/components/Evaluators/assets/utils.ts b/web/oss/src/components/Evaluators/assets/utils.ts
index 4b09fa2d46..a750ce248f 100644
--- a/web/oss/src/components/Evaluators/assets/utils.ts
+++ b/web/oss/src/components/Evaluators/assets/utils.ts
@@ -1,6 +1,7 @@
+import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils"
 import {formatDay} from "@/oss/lib/helpers/dateTimeHelper"
 import {capitalize} from "@/oss/lib/helpers/utils"
-import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types"
+import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types"
 
 import {
     EvaluatorCategory,
@@ -54,7 +55,7 @@ const formatDate = (value?: string) => {
     return formatDay({date: value})
 }
 
-const collectConfigTags = (config: EvaluatorConfig, evaluator?: Evaluator | null) => {
+const collectConfigTags = (config: SimpleEvaluator, evaluator?: Evaluator | null) => {
     const tags = new Set<string>()
 
     if (Array.isArray(config.tags)) {
@@ -132,11 +133,12 @@ export const transformEvaluatorsToRows = (
 }
 
 const buildConfigTypeBadge = (
-    config: EvaluatorConfig,
+    config: SimpleEvaluator,
     category: Extract<EvaluatorCategory, "automatic" | "custom">,
     evaluator?: Evaluator | null,
 ): EvaluatorTypeBadge => {
-    const label = evaluator?.name || createTypeLabel(config.evaluator_key, config.name)
+    const evaluatorKey = resolveEvaluatorKey(config)
+    const label = evaluator?.name || createTypeLabel(evaluatorKey, config.name)
     const colorHex = config.color || evaluator?.color
 
     return {
@@ -146,44 +148,54 @@ const buildConfigTypeBadge = (
     }
 }
 
-const extractConfigVersion = (config: EvaluatorConfig) => {
-    const serviceValues = (config.settings_values as any)?.service || {}
+const extractConfigVersion = (config: SimpleEvaluator) => {
+    const parameters = (config.data as any)?.parameters || {}
+    const serviceValues = (config.data as any)?.service || {}
+    const serviceConfig = serviceValues?.configuration || {}
     const candidate =
         (config as any)?.version ||
         serviceValues?.agenta ||
         serviceValues?.version ||
-        (config.settings_values as any)?.version ||
+        serviceConfig?.version ||
+        serviceConfig?.agenta ||
+        parameters?.version ||
         ""
 
     return sanitizeVersion(typeof candidate === "string" ? candidate : "")
 }
 
-const extractConfigModifiedBy = (config: EvaluatorConfig) => {
+const extractConfigModifiedBy = (config: SimpleEvaluator) => {
     const modifiedBy =
         (config as any)?.updated_by ||
         (config as any)?.updatedBy ||
+        (config as any)?.updated_by_id ||
+        (config as any)?.updatedById ||
         (config as any)?.created_by ||
         (config as any)?.createdBy ||
+        (config as any)?.created_by_id ||
+        (config as any)?.createdById ||
         ""
 
     return typeof modifiedBy === "string" ? modifiedBy : ""
 }
 
 export const transformEvaluatorConfigsToRows = (
-    configs: EvaluatorConfig[],
+    configs: SimpleEvaluator[],
     category: Extract<EvaluatorCategory, "automatic">,
     evaluators: Evaluator[],
 ): EvaluatorRegistryRow[] => {
     const evaluatorsMap = new Map(evaluators.map((item) => [item.key, item]))
 
     return configs.map((config) => {
-        const evaluator = evaluatorsMap.get(config.evaluator_key) || null
+        const evaluatorKey = resolveEvaluatorKey(config)
+        const evaluator = evaluatorKey ? evaluatorsMap.get(evaluatorKey) || null : null
         const badge = buildConfigTypeBadge(config, category, evaluator)
         const versionLabel = extractConfigVersion(config)
         const tags = collectConfigTags(config, evaluator)
         const modifiedBy = extractConfigModifiedBy(config)
         const createdAt = config.created_at
         const updatedAt = config.updated_at || createdAt
+        const displayName = config.name || evaluator?.name || evaluatorKey || config.slug || ""
 
         const raw: EvaluatorConfigRow = {
             ...config,
@@ -194,15 +206,15 @@ export const transformEvaluatorConfigsToRows = (
         return {
             key: config.id,
             id: config.id,
-            name: config.name,
-            slug: config.evaluator_key,
+            name: displayName,
+            slug: evaluatorKey || config.slug,
             typeBadge: badge,
             versionLabel,
             tags,
             dateCreated: formatDate(createdAt),
             lastModified: formatDate(updatedAt),
             modifiedBy,
-            avatarName: modifiedBy || config.name,
+            avatarName: modifiedBy || displayName,
             raw,
         }
     })
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
index ca07709a52..e1494219ab 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
@@ -25,6 +25,7 @@ import {
     resetPlaygroundAtom,
 } from "@/oss/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms"
 import useURL from "@/oss/hooks/useURL"
+import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils"
 import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData"
 import {Evaluator} from "@/oss/lib/Types"
 import {evaluatorByKeyAtomFamily} from "@/oss/state/evaluators"
@@ -63,7 +64,7 @@ const ConfigureEvaluatorPage = ({evaluatorId}: {evaluatorId?: string | null}) =>
         )
     }, [evaluatorConfigs, evaluatorId, stagedConfig])
 
-    const evaluatorKey = existingConfig?.evaluator_key ?? evaluatorId ?? null
+    const evaluatorKey = resolveEvaluatorKey(existingConfig) ?? evaluatorId ?? null
 
     const evaluatorQuery = useAtomValue(evaluatorByKeyAtomFamily(evaluatorKey))
     const evaluatorFromRegular = evaluators.find((item) => item.key === evaluatorKey)
diff --git a/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts b/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts
index 3aa171dc76..97fbb7ffc4 100644
--- a/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts
+++ b/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts
@@ -2,7 +2,7 @@ import {useCallback, useMemo} from "react"
 
 import useEvaluators from "@/oss/lib/hooks/useEvaluators"
 import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData"
-import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types"
+import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types"
 
 import {EvaluatorCategory, EvaluatorPreview, EvaluatorRegistryRow} from "../assets/types"
 import {
@@ -33,7 +33,7 @@ const useEvaluatorsRegistryData = (category: EvaluatorCategory) => {
             const humanEvaluators = (humanEvaluatorsSwr.data || []) as EvaluatorPreview[]
             unsortedRows = transformEvaluatorsToRows(humanEvaluators, "human")
         } else {
-            const evaluatorConfigs = (evaluatorConfigsSwr.data || []) as EvaluatorConfig[]
+            const evaluatorConfigs = (evaluatorConfigsSwr.data || []) as SimpleEvaluator[]
             const baseEvaluators = (baseEvaluatorsSwr.data || []) as Evaluator[]
 
             unsortedRows = transformEvaluatorConfigsToRows(
diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx
index cffdfdcd23..fd64b589ec 100644
--- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx
+++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx
@@ -7,6 +7,7 @@ import {useRouter} from "next/router"
 import {message} from "@/oss/components/AppMessageContext"
 import useURL from "@/oss/hooks/useURL"
 import {useVaultSecret} from "@/oss/hooks/useVaultSecret"
+import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils"
 import {redirectIfNoLLMKeys} from "@/oss/lib/helpers/utils"
 import useAppVariantRevisions from "@/oss/lib/hooks/useAppVariantRevisions"
 import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData"
@@ -289,7 +290,7 @@ const NewEvaluationModalInner = ({
             !preview &&
             selectedEvalConfigs.some(
                 (id) =>
-                    evaluatorConfigs.find((config) => config.id === id)?.evaluator_key ===
+                    resolveEvaluatorKey(evaluatorConfigs.find((config) => config.id === id)) ===
                     "auto_ai_critique",
             ) &&
             (await redirectIfNoLLMKeys({secrets}))
diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx
index 3545f0b98a..b7bd3b649a 100644
--- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx
+++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx
@@ -11,9 +11,10 @@ import router from "next/router"
 
 import {getMetricsFromEvaluator} from "@/oss/components/SharedDrawers/AnnotateDrawer/assets/transforms"
 import useURL from "@/oss/hooks/useURL"
+import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils"
 import {EvaluatorDto} from "@/oss/lib/hooks/useEvaluators/types"
 import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData"
-import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types"
+import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types"
 
 import {openEvaluatorDrawerAtom} from "../../../autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms"
 import type {SelectEvaluatorSectionProps} from "../../types"
@@ -88,12 +89,12 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
 
     const evaluatorConfigs = useMemo(() => {
         if (preview) {
-            return evaluators as EvaluatorConfig[]
+            return [] as SimpleEvaluator[]
         }
         return (
             propsEvaluatorConfigs?.length ? propsEvaluatorConfigs : evaluatorConfigsSwr.data || []
-        ) as EvaluatorConfig[]
-    }, [preview, propsEvaluatorConfigs, evaluatorConfigsSwr.data, evaluators])
+        ) as SimpleEvaluator[]
+    }, [preview, propsEvaluatorConfigs, evaluatorConfigsSwr.data])
 
     const isLoadingEvaluators = fetchLoadingEvaluators
     const isLoadingEvaluatorConfigs = fetchLoadingConfigs
@@ -122,7 +123,7 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
         const availableIds = new Set(
             (preview
                 ? (evaluators as EvaluatorDto<"response">[])
-                : (evaluatorConfigs as EvaluatorConfig[])
+                : (evaluatorConfigs as SimpleEvaluator[])
             ).map((config) => config.id),
         )
 
@@ -141,10 +142,9 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
 
     // Handler to open the drawer in edit mode
     const handleEditConfig = useCallback(
-        (record: EvaluatorConfig) => {
-            const evaluator = (evaluators as Evaluator[]).find(
-                (e) => e.key === record.evaluator_key,
-            )
+        (record: SimpleEvaluator) => {
+            const evaluatorKey = resolveEvaluatorKey(record)
+            const evaluator = (evaluators as Evaluator[]).find((e) => e.key === evaluatorKey)
             if (evaluator) {
                 openEvaluatorDrawer({
                     evaluator,
@@ -158,10 +158,9 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
 
     // Handler to open the drawer in clone mode
     const handleCloneConfig = useCallback(
-        (record: EvaluatorConfig) => {
-            const evaluator = (evaluators as Evaluator[]).find(
-                (e) => e.key === record.evaluator_key,
-            )
+        (record: SimpleEvaluator) => {
+            const evaluatorKey = resolveEvaluatorKey(record)
+            const evaluator = (evaluators as Evaluator[]).find((e) => e.key === evaluatorKey)
             if (evaluator) {
                 openEvaluatorDrawer({
                     evaluator,
@@ -203,13 +202,13 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
         [],
     )
 
-    const columnsConfig: ColumnsType<EvaluatorConfig> = useMemo(
+    const columnsConfig: ColumnsType<SimpleEvaluator> = useMemo(
         () => [
             {
                 title: "Name",
                 dataIndex: "name",
                 key: "name",
-                render: (_, record: EvaluatorConfig) => {
+                render: (_, record: SimpleEvaluator) => {
                     return <div>{record.name}</div>
                 },
             },
@@ -217,10 +216,11 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
                 title: "Type",
                 dataIndex: "type",
                 key: "type",
-                render: (x, record: EvaluatorConfig) => {
+                render: (x, record: SimpleEvaluator) => {
                     // Find the evaluator by key to display its name
+                    const evaluatorKey = resolveEvaluatorKey(record)
                     const evaluator = (evaluators as Evaluator[]).find(
-                        (item) => item.key === record.evaluator_key,
+                        (item) => item.key === evaluatorKey,
                     )
                     return <Tag color={record.color}>{evaluator?.name}</Tag>
                 },
@@ -231,7 +231,7 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
                 width: 56,
                 fixed: "right",
                 align: "center",
-                render: (_, record: EvaluatorConfig) => {
+                render: (_, record: SimpleEvaluator) => {
                     return (
                         <Dropdown
                             trigger={["click"]}
@@ -276,7 +276,7 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
     // Conditionally type filteredEvalConfigs based on Preview
     const filteredEvalConfigs: Preview extends true
         ? EvaluatorDto<"response">[]
-        : EvaluatorConfig[] = useMemo(() => {
+        : SimpleEvaluator[] = useMemo(() => {
         if (preview) {
             // Explicitly narrow types for Preview = true (human evaluations)
             let data = evaluators as EvaluatorDto<"response">[]
@@ -295,21 +295,21 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
 
             if (!searchTerm) return data as any
             return data.filter((item) =>
-                item.name.toLowerCase().includes(searchTerm.toLowerCase()),
+                (item.name || "").toLowerCase().includes(searchTerm.toLowerCase()),
             ) as any
         } else {
             // Explicitly narrow types for Preview = false
-            const data = evaluatorConfigs as EvaluatorConfig[]
+            const data = evaluatorConfigs as SimpleEvaluator[]
             if (!searchTerm) return data
             return data.filter((item) =>
-                item.name.toLowerCase().includes(searchTerm.toLowerCase()),
+                (item.name || "").toLowerCase().includes(searchTerm.toLowerCase()),
             ) as any
         }
     }, [searchTerm, evaluatorConfigs, preview, evaluators])
 
     const onSelectEvalConfig = (selectedRowKeys: React.Key[]) => {
         const currentSelected = new Set(selectedEvalConfigs)
-        const configs = filteredEvalConfigs as EvaluatorDto<"response">[]
+        const configs = filteredEvalConfigs as {id: string}[]
         configs.forEach((item) => {
             if (selectedRowKeys.includes(item.id)) {
                 currentSelected.add(item.id)
@@ -331,7 +331,7 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
                 ).length > 0
             )
         }
-        return (evaluatorConfigs as EvaluatorConfig[]).length > 0
+        return (evaluatorConfigs as SimpleEvaluator[]).length > 0
     }, [preview, evaluators, evaluatorConfigs])
 
     return (
@@ -418,7 +418,7 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
                         pagination={false}
                     />
                 ) : (
-                    <Table<EvaluatorConfig>
+                    <Table<SimpleEvaluator>
                         rowSelection={{
                             type: "checkbox",
                             columnWidth: 48,
@@ -442,7 +442,7 @@ const SelectEvaluatorSection = <Preview extends boolean = false>({
                         className="ph-no-capture"
                         columns={columnsConfig}
                         rowKey={"id"}
-                        dataSource={filteredEvalConfigs as EvaluatorConfig[]}
+                        dataSource={filteredEvalConfigs as SimpleEvaluator[]}
                         scroll={{x: true, y: 455}}
                         bordered
                         pagination={false}
diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts b/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts
index a068971bc4..5f838f8665 100644
--- a/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts
+++ b/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts
@@ -4,7 +4,7 @@ import {ModalProps} from "antd"
 
 import {EvaluatorDto} from "@/oss/lib/hooks/useEvaluators/types"
 import {EnhancedVariant} from "@/oss/lib/shared/variant/transformer/types"
-import {LLMRunRateLimit, Evaluator, EvaluatorConfig, testset} from "@/oss/lib/Types"
+import {LLMRunRateLimit, Evaluator, SimpleEvaluator, testset} from "@/oss/lib/Types"
 
 export interface NewEvaluationAppOption {
     label: string
@@ -54,7 +54,7 @@ export interface NewEvaluationModalContentProps extends HTMLProps<HTMLDivElement
     variants?: EnhancedVariant[]
     variantsLoading?: boolean
     evaluators: Evaluator[] | EvaluatorDto<"response">[]
-    evaluatorConfigs: EvaluatorConfig[]
+    evaluatorConfigs: SimpleEvaluator[]
     advanceSettings: LLMRunRateLimitWithCorrectAnswer
     setAdvanceSettings: Dispatch<SetStateAction<LLMRunRateLimitWithCorrectAnswer>>
     appOptions: NewEvaluationAppOption[]
@@ -95,7 +95,7 @@ export interface SelectTestsetSectionProps extends HTMLProps<HTMLDivElement> {
 }
 
 export interface SelectEvaluatorSectionProps extends HTMLProps<HTMLDivElement> {
-    evaluatorConfigs: EvaluatorConfig[]
+    evaluatorConfigs: SimpleEvaluator[]
     evaluators: Evaluator[]
     selectedEvalConfigs: string[]
     setSelectedEvalConfigs: Dispatch<SetStateAction<string[]>>
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx
index 6957d3438a..6a0aed5f8f 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx
@@ -71,7 +71,7 @@ const AdvancedSettings: React.FC<AdvancedSettingsProps> = ({settings, selectedTe
                     return (
                         <Form.Item
                             key={field.key}
-                            name={["settings_values", field.key]}
+                            name={["parameters", field.key]}
                             initialValue={field.default}
                             rules={rules}
                             label={label}
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx
index 5de982d585..49ef6e2f85 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx
@@ -372,8 +372,8 @@ const DebugSection = () => {
             setEvalOutputStatus({success: false, error: false})
             setIsLoadingResult(true)
 
-            const settingsValues = form.getFieldValue("settings_values") || {}
-            let normalizedSettings = {...settingsValues}
+            const parameters = form.getFieldValue("parameters") || {}
+            let normalizedSettings = {...parameters}
 
             if (typeof normalizedSettings.json_schema === "string") {
                 try {
@@ -419,7 +419,7 @@ const DebugSection = () => {
             }
 
             if (!selectedEvaluator.key.startsWith("rag_")) {
-                const correctAnswerKey = settingsValues.correct_answer_key
+                const correctAnswerKey = parameters.correct_answer_key
                 const groundTruthKey =
                     typeof correctAnswerKey === "string" && correctAnswerKey.startsWith("testcase.")
                         ? correctAnswerKey.split(".")[1]
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx
index a8128c43e7..c7a3df73f6 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx
@@ -105,7 +105,7 @@ export const DynamicFormField: React.FC<DynamicFormFieldProps> = ({
     form,
 }) => {
     const settingsValue = Form.useWatch(name, form)
-    const runtime = Form.useWatch(["settings_values", "runtime"], form)
+    const runtime = Form.useWatch(["parameters", "runtime"], form)
 
     const classes = useStyles()
     const {token} = theme.useToken()
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx
index a96a07a37f..f5ddf000df 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx
@@ -55,7 +55,7 @@ export const FieldsTagsEditor: React.FC<FieldsTagsEditorProps> = ({
 
     // Watch the correct_answer_key from form to react to changes
     // Using Form.useWatch instead of form.getFieldValue for reactivity
-    const formCorrectAnswerKey = Form.useWatch(["settings_values", "correct_answer_key"], form)
+    const formCorrectAnswerKey = Form.useWatch(["parameters", "correct_answer_key"], form)
     const effectiveKey = formCorrectAnswerKey || correctAnswerKey
 
     // Check if we can detect fields from testcase
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx
index 331afe0852..1454b99565 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx
@@ -13,7 +13,7 @@ import {useAppId} from "@/oss/hooks/useAppId"
 import useURL from "@/oss/hooks/useURL"
 import {EvaluationSettingsTemplate, JSSTheme, SettingsPreset} from "@/oss/lib/Types"
 import {
-    CreateEvaluationConfigData,
+    CreateEvaluatorConfigData,
     createEvaluatorConfig,
     updateEvaluatorConfig,
 } from "@/oss/services/evaluations/api"
@@ -69,6 +69,13 @@ interface ConfigureEvaluatorProps {
     onToggleTestPanel?: () => void
 }
 
+interface ConfigureEvaluatorFormValues {
+    name: string
+    description?: string
+    tags?: string[]
+    parameters?: Record<string, any>
+}
+
 const useStyles = createUseStyles((theme: JSSTheme) => ({
     collapseContainer: {
         "& .ant-collapse-header": {
@@ -199,12 +206,10 @@ const ConfigureEvaluator = ({
             const allKeys = Array.from(new Set([...templateKeys, ...presetKeys]))
 
             // Clear subtree before applying new values to avoid stale keys
-            form.setFieldsValue({settings_values: {}})
+            form.setFieldsValue({parameters: {}})
 
             if (allKeys.length) {
-                const fieldNames = allKeys.map(
-                    (key) => ["settings_values", key] as (string | number)[],
-                )
+                const fieldNames = allKeys.map((key) => ["parameters", key] as (string | number)[])
                 form.resetFields(fieldNames)
 
                 const nextFields = fieldNames
@@ -248,7 +253,7 @@ const ConfigureEvaluator = ({
 
     const evaluatorVersionNumber = useMemo(() => {
         const raw =
-            editEvalEditValues?.settings_values?.version ??
+            editEvalEditValues?.data?.parameters?.version ??
             selectedEvaluator?.settings_template?.version?.default ??
             3
 
@@ -256,7 +261,7 @@ const ConfigureEvaluator = ({
         // extract leading number (e.g., "4", "4.1", "v4")
         const match = String(raw).match(/\d+(\.\d+)?/)
         return match ? parseFloat(match[0]) : 3
-    }, [editEvalEditValues?.settings_values?.version, selectedEvaluator])
+    }, [editEvalEditValues?.data?.parameters?.version, selectedEvaluator])
 
     const evalFields = useMemo(() => {
         const templateEntries = Object.entries(selectedEvaluator?.settings_template || {})
@@ -283,28 +288,25 @@ const ConfigureEvaluator = ({
     const advancedSettingsFields = evalFields.filter((field) => field.advanced)
     const basicSettingsFields = evalFields.filter((field) => !field.advanced)
 
-    const onSubmit = async (values: CreateEvaluationConfigData) => {
+    const onSubmit = async (values: ConfigureEvaluatorFormValues) => {
         try {
             setSubmitLoading(true)
             if (!selectedEvaluator?.key) throw new Error("No selected key")
-            const settingsValues = values.settings_values || {}
+            const parameters = values.parameters || {}
 
-            const jsonSchemaFieldPath: (string | number)[] = ["settings_values", "json_schema"]
-            const hasJsonSchema = Object.prototype.hasOwnProperty.call(
-                settingsValues,
-                "json_schema",
-            )
+            const jsonSchemaFieldPath: (string | number)[] = ["parameters", "json_schema"]
+            const hasJsonSchema = Object.prototype.hasOwnProperty.call(parameters, "json_schema")
 
             if (hasJsonSchema) {
                 form.setFields([{name: jsonSchemaFieldPath, errors: []}])
 
-                if (typeof settingsValues.json_schema === "string") {
+                if (typeof parameters.json_schema === "string") {
                     try {
-                        const parsed = JSON.parse(settingsValues.json_schema)
+                        const parsed = JSON.parse(parameters.json_schema)
                         if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
                             throw new Error()
                         }
-                        settingsValues.json_schema = parsed
+                        parameters.json_schema = parsed
                     } catch {
                         form.setFields([
                             {
@@ -315,9 +317,9 @@ const ConfigureEvaluator = ({
                         throw new Error("JSON schema must be a valid JSON object")
                     }
                 } else if (
-                    settingsValues.json_schema &&
-                    (typeof settingsValues.json_schema !== "object" ||
-                        Array.isArray(settingsValues.json_schema))
+                    parameters.json_schema &&
+                    (typeof parameters.json_schema !== "object" ||
+                        Array.isArray(parameters.json_schema))
                 ) {
                     form.setFields([
                         {
@@ -329,40 +331,43 @@ const ConfigureEvaluator = ({
                 }
             }
 
-            const data = {
-                ...values,
-                evaluator_key: selectedEvaluator!.key,
-                settings_values: settingsValues,
+            const existingParameters = editEvalEditValues?.data?.parameters || {}
+            const mergedParameters = {...existingParameters, ...parameters}
+
+            const payload: CreateEvaluatorConfigData = {
+                name: values.name,
+                description: values.description,
+                tags: values.tags,
+                evaluator_key: selectedEvaluator.key,
+                parameters,
             }
 
             if (editMode) {
-                await updateEvaluatorConfig(editEvalEditValues?.id!, data)
-
-                // Update atom with merged values
-                const updatedConfig = editEvalEditValues
-                    ? {
-                          ...editEvalEditValues,
-                          ...data,
-                          settings_values: settingsValues,
-                      }
-                    : null
-                if (updatedConfig) {
-                    commitPlayground(updatedConfig)
-                }
+                const updatedEvaluator = await updateEvaluatorConfig(editEvalEditValues?.id!, {
+                    id: editEvalEditValues?.id!,
+                    name: values.name,
+                    description: editEvalEditValues?.description,
+                    tags: editEvalEditValues?.tags,
+                    meta: editEvalEditValues?.meta,
+                    flags: editEvalEditValues?.flags,
+                    data: {
+                        ...(editEvalEditValues?.data ?? {}),
+                        parameters: mergedParameters,
+                    },
+                })
+
+                commitPlayground(updatedEvaluator)
             } else {
-                const response = await createEvaluatorConfig(appId, data)
-                const createdConfig = response?.data
-
-                if (createdConfig) {
-                    // Use commitPlayground to update state and switch to edit mode
-                    commitPlayground(createdConfig)
-                    if (uiVariant === "page" && createdConfig.id) {
-                        await router.replace(
-                            `${projectURL}/evaluators/configure/${encodeURIComponent(
-                                createdConfig.id,
-                            )}`,
-                        )
-                    }
+                const createdConfig = await createEvaluatorConfig(appId, payload)
+
+                // Use commitPlayground to update state and switch to edit mode
+                commitPlayground(createdConfig)
+                if (uiVariant === "page" && createdConfig.id) {
+                    await router.replace(
+                        `${projectURL}/evaluators/configure/${encodeURIComponent(
+                            createdConfig.id,
+                        )}`,
+                    )
                 }
             }
 
@@ -381,15 +386,15 @@ const ConfigureEvaluator = ({
         form.resetFields()
 
         if (editMode && editEvalEditValues) {
-            // Load all values including nested settings_values
+            // Load all values including nested parameters
             form.setFieldsValue({
                 ...editEvalEditValues,
-                settings_values: editEvalEditValues.settings_values || {},
+                parameters: editEvalEditValues.data?.parameters || {},
             })
         } else if (cloneConfig && editEvalEditValues) {
-            // When cloning, copy only settings_values and clear the name so user provides a new name
+            // When cloning, copy only parameters and clear the name so user provides a new name
             form.setFieldsValue({
-                settings_values: editEvalEditValues.settings_values || {},
+                parameters: editEvalEditValues.data?.parameters || {},
                 name: "",
             })
         } else if (selectedEvaluator?.settings_template) {
@@ -404,7 +409,7 @@ const ConfigureEvaluator = ({
             }
             if (Object.keys(defaultSettings).length > 0) {
                 form.setFieldsValue({
-                    settings_values: defaultSettings,
+                    parameters: defaultSettings,
                 })
             }
         }
@@ -556,7 +561,7 @@ const ConfigureEvaluator = ({
                                                     key={field.key}
                                                     traceTree={traceTree}
                                                     form={form}
-                                                    name={["settings_values", field.key]}
+                                                    name={["parameters", field.key]}
                                                 />
                                             ))}
                                         </div>
@@ -674,7 +679,7 @@ const ConfigureEvaluator = ({
                                                     key={field.key}
                                                     traceTree={traceTree}
                                                     form={form}
-                                                    name={["settings_values", field.key]}
+                                                    name={["parameters", field.key]}
                                                 />
                                             ))}
                                         </div>
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts
index 76b8c134c2..dcb15dcd42 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts
@@ -18,7 +18,7 @@ import type {FormInstance} from "antd"
 import {atom} from "jotai"
 import {atomWithReset, atomWithStorage, RESET} from "jotai/utils"
 
-import type {Evaluator, EvaluatorConfig, Variant} from "@/oss/lib/Types"
+import type {Evaluator, SimpleEvaluator, Variant} from "@/oss/lib/Types"
 import {stringStorage} from "@/oss/state/utils/stringStorage"
 
 // ================================================================
@@ -84,7 +84,7 @@ export const playgroundIsCloneModeAtom = atom((get) => get(playgroundSessionAtom
  * - In edit mode: loaded from existing config
  * - In clone mode: copied from source config (with cleared name)
  */
-export const playgroundEditValuesAtom = atomWithReset<EvaluatorConfig | null>(null)
+export const playgroundEditValuesAtom = atomWithReset<SimpleEvaluator | null>(null)
 
 // ================================================================
 // FORM STATE
@@ -95,7 +95,7 @@ export const playgroundEditValuesAtom = atomWithReset<EvaluatorConfig | null>(nu
  * Allows DebugSection to read form values for running the evaluator
  *
  * This is set by ConfigureEvaluator when the form mounts
- * and read by DebugSection to get current settings_values
+ * and read by DebugSection to get current parameters
  */
 export const playgroundFormRefAtom = atom<FormInstance | null>(null)
 
@@ -179,7 +179,7 @@ export const initPlaygroundAtom = atom(
         set,
         payload: {
             evaluator: Evaluator
-            existingConfig?: EvaluatorConfig | null
+            existingConfig?: SimpleEvaluator | null
             mode?: PlaygroundMode
         },
     ) => {
@@ -226,7 +226,7 @@ export const resetPlaygroundAtom = atom(null, (get, set) => {
  *
  * @param savedConfig - The config returned from the API
  */
-export const commitPlaygroundAtom = atom(null, (get, set, savedConfig: EvaluatorConfig) => {
+export const commitPlaygroundAtom = atom(null, (get, set, savedConfig: SimpleEvaluator) => {
     // Update edit values with saved config
     set(playgroundEditValuesAtom, savedConfig)
 
@@ -280,7 +280,7 @@ export const openEvaluatorDrawerAtom = atom(
         set,
         payload: {
             evaluator: Evaluator
-            existingConfig?: EvaluatorConfig | null
+            existingConfig?: SimpleEvaluator | null
             mode?: PlaygroundMode
         },
     ) => {
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx
index 0ac235b386..c30bb3c1f1 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx
@@ -5,11 +5,11 @@ import {Modal, Space, theme, Typography} from "antd"
 import {createUseStyles} from "react-jss"
 
 import {checkIfResourceValidForDeletion} from "@/oss/lib/evaluations/legacy"
-import {EvaluatorConfig, JSSTheme} from "@/oss/lib/Types"
+import {JSSTheme, SimpleEvaluator} from "@/oss/lib/Types"
 import {deleteEvaluatorConfig} from "@/oss/services/evaluations/api"
 
 type DeleteModalProps = {
-    selectedEvalConfig: EvaluatorConfig
+    selectedEvalConfig: SimpleEvaluator
     onSuccess: () => void
 } & React.ComponentProps<typeof Modal>
 
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx
index f3c9434a38..72aaf034fc 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx
@@ -7,18 +7,19 @@ import {useAtom} from "jotai"
 import {createUseStyles} from "react-jss"
 
 import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation"
+import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils"
 import {formatDay} from "@/oss/lib/helpers/dateTimeHelper"
-import {Evaluator, EvaluatorConfig, JSSTheme} from "@/oss/lib/Types"
+import {Evaluator, JSSTheme, SimpleEvaluator} from "@/oss/lib/Types"
 
 import DeleteModal from "./DeleteModal"
 
 interface EvaluatorCardProps {
-    evaluatorConfigs: EvaluatorConfig[]
+    evaluatorConfigs: SimpleEvaluator[]
     setEditMode: React.Dispatch<React.SetStateAction<boolean>>
     setCloneConfig: React.Dispatch<React.SetStateAction<boolean>>
     setCurrent: React.Dispatch<React.SetStateAction<number>>
     setSelectedEvaluator: React.Dispatch<React.SetStateAction<Evaluator | null>>
-    setEditEvalEditValues: React.Dispatch<React.SetStateAction<EvaluatorConfig | null>>
+    setEditEvalEditValues: React.Dispatch<React.SetStateAction<SimpleEvaluator | null>>
     onSuccess: () => void
 }
 
@@ -88,22 +89,21 @@ const EvaluatorCard = ({
     const classes = useStyles()
     const evaluators = useAtom(evaluatorsAtom)[0]
     const [openDeleteModal, setOpenDeleteModal] = useState(false)
-    const [selectedDelEval, setSelectedDelEval] = useState<EvaluatorConfig | null>(null)
+    const [selectedDelEval, setSelectedDelEval] = useState<SimpleEvaluator | null>(null)
 
     return (
         <div className={classes.container}>
             {evaluatorConfigs.length ? (
                 evaluatorConfigs.map((item) => {
-                    const evaluator = evaluators.find((e) => e.key === item.evaluator_key)
+                    const evaluatorKey = resolveEvaluatorKey(item)
+                    const evaluator = evaluators.find((e) => e.key === evaluatorKey)
 
                     return (
                         <Card
                             key={item.id}
                             className={classes.evaluatorCard}
                             onClick={() => {
-                                const selectedEval = evaluators.find(
-                                    (e) => e.key === item.evaluator_key,
-                                )
+                                const selectedEval = evaluators.find((e) => e.key === evaluatorKey)
                                 if (selectedEval) {
                                     setEditMode(true)
                                     setSelectedEvaluator(selectedEval)
@@ -130,7 +130,7 @@ const EvaluatorCard = ({
                                                 onClick: (e: any) => {
                                                     e.domEvent.stopPropagation()
                                                     const selectedEval = evaluators.find(
-                                                        (e) => e.key === item.evaluator_key,
+                                                        (e) => e.key === evaluatorKey,
                                                     )
                                                     if (selectedEval) {
                                                         setEditMode(true)
@@ -147,7 +147,7 @@ const EvaluatorCard = ({
                                                 onClick: (e: any) => {
                                                     e.domEvent.stopPropagation()
                                                     const selectedEval = evaluators.find(
-                                                        (e) => e.key === item.evaluator_key,
+                                                        (e) => e.key === evaluatorKey,
                                                     )
                                                     if (selectedEval) {
                                                         setCloneConfig(true)
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx
index 2e38bfd1c2..33c03a9f89 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx
@@ -7,17 +7,18 @@ import {ColumnsType} from "antd/es/table"
 import {useAtom} from "jotai"
 
 import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation"
-import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types"
+import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils"
+import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types"
 
 import DeleteModal from "./DeleteModal"
 
 interface EvaluatorListProps {
-    evaluatorConfigs: EvaluatorConfig[]
+    evaluatorConfigs: SimpleEvaluator[]
     setEditMode: React.Dispatch<React.SetStateAction<boolean>>
     setCloneConfig: React.Dispatch<React.SetStateAction<boolean>>
     setCurrent: React.Dispatch<React.SetStateAction<number>>
     setSelectedEvaluator: React.Dispatch<React.SetStateAction<Evaluator | null>>
-    setEditEvalEditValues: React.Dispatch<React.SetStateAction<EvaluatorConfig | null>>
+    setEditEvalEditValues: React.Dispatch<React.SetStateAction<SimpleEvaluator | null>>
     onSuccess: () => void
 }
 
@@ -32,9 +33,9 @@ const EvaluatorList = ({
 }: EvaluatorListProps) => {
     const evaluators = useAtom(evaluatorsAtom)[0]
     const [openDeleteModal, setOpenDeleteModal] = useState(false)
-    const [selectedDelEval, setSelectedDelEval] = useState<EvaluatorConfig | null>(null)
+    const [selectedDelEval, setSelectedDelEval] = useState<SimpleEvaluator | null>(null)
 
-    const columns: ColumnsType<EvaluatorConfig> = [
+    const columns: ColumnsType<SimpleEvaluator> = [
         // {
         //     title: "Version",
         //     dataIndex: "version",
@@ -56,7 +57,8 @@ const EvaluatorList = ({
             dataIndex: "type",
             key: "type",
             render: (_, record) => {
-                const evaluator = evaluators.find((item) => item.key === record.evaluator_key)
+                const evaluatorKey = resolveEvaluatorKey(record)
+                const evaluator = evaluators.find((item) => item.key === evaluatorKey)
                 return <Tag color={record.color}>{evaluator?.name}</Tag>
             },
         },
@@ -84,8 +86,9 @@ const EvaluatorList = ({
                                     icon: <Note size={16} />,
                                     onClick: (e: any) => {
                                         e.domEvent.stopPropagation()
+                                        const evaluatorKey = resolveEvaluatorKey(record)
                                         const selectedEval = evaluators.find(
-                                            (e) => e.key === record.evaluator_key,
+                                            (e) => e.key === evaluatorKey,
                                         )
                                         if (selectedEval) {
                                             setEditMode(true)
@@ -101,8 +104,9 @@ const EvaluatorList = ({
                                     icon: <Copy size={16} />,
                                     onClick: (e: any) => {
                                         e.domEvent.stopPropagation()
+                                        const evaluatorKey = resolveEvaluatorKey(record)
                                         const selectedEval = evaluators.find(
-                                            (e) => e.key === record.evaluator_key,
+                                            (e) => e.key === evaluatorKey,
                                         )
                                         if (selectedEval) {
                                             setCloneConfig(true)
@@ -151,7 +155,8 @@ const EvaluatorList = ({
                 onRow={(record) => ({
                     style: {cursor: "pointer"},
                     onClick: () => {
-                        const selectedEval = evaluators.find((e) => e.key === record.evaluator_key)
+                        const evaluatorKey = resolveEvaluatorKey(record)
+                        const selectedEval = evaluators.find((e) => e.key === evaluatorKey)
                         if (selectedEval) {
                             setEditMode(true)
                             setSelectedEvaluator(selectedEval)
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx
index 60569766c2..564bc38df9 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx
@@ -8,21 +8,22 @@ import {createUseStyles} from "react-jss"
 
 import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation"
 import {getEvaluatorTags} from "@/oss/lib/evaluations/legacy"
-import {Evaluator, EvaluatorConfig, JSSTheme} from "@/oss/lib/Types"
+import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils"
+import {Evaluator, JSSTheme, SimpleEvaluator} from "@/oss/lib/Types"
 import {nonArchivedEvaluatorsAtom} from "@/oss/state/evaluators"
 
 import EvaluatorCard from "./EvaluatorCard"
 import EvaluatorList from "./EvaluatorList"
 
 interface EvaluatorsProps {
-    evaluatorConfigs: EvaluatorConfig[]
+    evaluatorConfigs: SimpleEvaluator[]
     handleOnCancel: () => void
     setCurrent: React.Dispatch<React.SetStateAction<number>>
     setSelectedEvaluator: React.Dispatch<React.SetStateAction<Evaluator | null>>
     fetchingEvalConfigs: boolean
     setEditMode: React.Dispatch<React.SetStateAction<boolean>>
     setCloneConfig: React.Dispatch<React.SetStateAction<boolean>>
-    setEditEvalEditValues: React.Dispatch<React.SetStateAction<EvaluatorConfig | null>>
+    setEditEvalEditValues: React.Dispatch<React.SetStateAction<SimpleEvaluator | null>>
     onSuccess: () => void
     setEvaluatorsDisplay: any
     evaluatorsDisplay: string
@@ -95,10 +96,13 @@ const Evaluators = ({
 
     const updatedEvaluatorConfigs = useMemo(() => {
         return evaluatorConfigs.map((config) => {
-            const matchingEvaluator = evaluators.find(
-                (evaluator) => evaluator.key === config.evaluator_key,
+            const evaluatorKey = resolveEvaluatorKey(config)
+            const matchingEvaluator = evaluators.find((evaluator) => evaluator.key === evaluatorKey)
+            if (!matchingEvaluator) return config
+            const nextTags = Array.from(
+                new Set([...(config.tags || []), ...(matchingEvaluator.tags || [])]),
             )
-            return matchingEvaluator ? {...config, tags: matchingEvaluator.tags} : config
+            return {...config, tags: nextTags}
         })
     }, [evaluatorConfigs, evaluators])
 
@@ -111,7 +115,7 @@ const Evaluators = ({
 
         if (searchTerm) {
             filtered = filtered.filter((item) =>
-                item.name.toLowerCase().includes(searchTerm.toLowerCase()),
+                (item.name || "").toLowerCase().includes(searchTerm.toLowerCase()),
             )
         }
 
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx
index b89da2ee19..c06202394c 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx
@@ -9,7 +9,7 @@ import EnhancedModal from "@/oss/components/EnhancedUIs/Modal"
 import {useAppId} from "@/oss/hooks/useAppId"
 import {evaluatorConfigsAtom} from "@/oss/lib/atoms/evaluation"
 import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData"
-import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types"
+import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types"
 
 import ConfigureEvaluator from "./ConfigureEvaluator"
 import {initPlaygroundAtom, resetPlaygroundAtom} from "./ConfigureEvaluator/state/atoms"
@@ -39,7 +39,7 @@ const EvaluatorsModal = ({
         useFetchEvaluatorsData({appId: appId ?? ""})
     const [editMode, setEditMode] = useState(false)
     const [cloneConfig, setCloneConfig] = useState(false)
-    const [editEvalEditValues, setEditEvalEditValues] = useState<EvaluatorConfig | null>(null)
+    const [editEvalEditValues, setEditEvalEditValues] = useState<SimpleEvaluator | null>(null)
     const [evaluatorsDisplay, setEvaluatorsDisplay] = useLocalStorage<"card" | "list">(
         "evaluator_view",
         "list",
diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx b/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx
index 9687626c3c..d0281e7539 100644
--- a/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx
+++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx
@@ -63,7 +63,7 @@ const OnlineEvaluationDrawer = ({open, onClose, onCreate}: OnlineEvaluationDrawe
     const filterColumns = useMemo(() => getFilterColumns(), [])
     const [filters, setFilters] = useAtom(onlineEvalFiltersAtom)
     const resetFilters = useSetAtom(resetOnlineEvalFiltersAtom)
-    // Load preview evaluators (with IDs) to map evaluator_config.evaluator_key -> evaluator.id
+    // Load preview evaluators (with IDs) to map config URI key -> evaluator.id
     const previewEvaluatorsSwr = useEvaluators({preview: true, queries: {is_human: false}})
     const baseEvaluators = (baseEvaluatorsSwr.data as Evaluator[] | undefined) ?? []
     const evaluators = useAtomValue(evaluatorConfigsAtom)
diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts
index a49787e814..42612e4322 100644
--- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts
+++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts
@@ -47,18 +47,41 @@ const mergeEvaluatorWithConfig = (
         ...configAny,
     }
 
-    const previewSettings = isPlainObject(evaluatorAny.settings_values)
-        ? (evaluatorAny.settings_values as Record<string, unknown>)
+    const previewData = isPlainObject(evaluatorAny.data)
+        ? (evaluatorAny.data as Record<string, unknown>)
         : undefined
-    const configSettings = isPlainObject(configAny.settings_values)
-        ? (configAny.settings_values as Record<string, unknown>)
+    const configData = isPlainObject(configAny.data)
+        ? (configAny.data as Record<string, unknown>)
         : undefined
+    if (previewData || configData) {
+        const mergedData: Record<string, unknown> = {
+            ...(previewData ?? {}),
+            ...(configData ?? {}),
+        }
 
-    if (previewSettings || configSettings) {
-        merged.settings_values = {
+        const previewParameters = isPlainObject(previewData?.parameters)
+            ? (previewData?.parameters as Record<string, unknown>)
+            : undefined
+        const configParameters = isPlainObject(configData?.parameters)
+            ? (configData?.parameters as Record<string, unknown>)
+            : undefined
+        const previewSettings = isPlainObject(evaluatorAny.settings_values)
+            ? (evaluatorAny.settings_values as Record<string, unknown>)
+            : undefined
+        const configSettings = isPlainObject(configAny.settings_values)
+            ? (configAny.settings_values as Record<string, unknown>)
+            : undefined
+        const mergedParameters = {
+            ...(previewParameters ?? {}),
             ...(previewSettings ?? {}),
+            ...(configParameters ?? {}),
             ...(configSettings ?? {}),
         }
+        if (Object.keys(mergedParameters).length) {
+            mergedData.parameters = mergedParameters
+        }
+
+        merged.data = mergedData
     }
 
     return merged as EvaluatorPreviewDto
diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx
index af624b9f83..c1cf18acd2 100644
--- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx
+++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx
@@ -1,9 +1,10 @@
 import {useMemo} from "react"
 
 import {SelectProps} from "antd"
 
+import {getEvaluatorParameters, resolveEvaluatorKey} from "@/oss/lib/evaluators/utils"
 import type {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types"
-import type {Evaluator} from "@/oss/lib/Types"
+import type {Evaluator, SimpleEvaluator} from "@/oss/lib/Types"
 
 import {
     ALLOWED_ONLINE_EVALUATOR_KEYS,
@@ -13,7 +15,7 @@ import {
 import {capitalize, collectEvaluatorCandidates} from "../utils/evaluatorDetails"
 
 interface UseEvaluatorSelectionParams {
-    evaluators: any[]
+    evaluators: SimpleEvaluator[]
     selectedEvaluatorId: string | undefined
     previewEvaluators: EvaluatorPreviewDto[]
     baseEvaluators: Evaluator[]
@@ -21,16 +23,17 @@ interface UseEvaluatorSelectionParams {
 
 interface EvaluatorSelectionResult {
     evaluatorOptions: SelectProps["options"]
-    selectedEvaluatorConfig?: any
+    selectedEvaluatorConfig?: SimpleEvaluator
     matchedPreviewEvaluator?: EvaluatorPreviewDto
     evaluatorTypeLookup: Map<string, {slug: string; label: string}>
 }
 
-const buildEvaluatorOptions = (configs: any[]): SelectProps["options"] =>
+const buildEvaluatorOptions = (configs: SimpleEvaluator[]): SelectProps["options"] =>
     (configs || []).map((cfg: any) => {
         const iconSrc = (cfg?.icon_url && (cfg.icon_url.src || cfg.icon_url)) || undefined
         const displayName = cfg?.name || ""
-        const searchable = [displayName, cfg?.evaluator_key, cfg?.id]
+        const evaluatorKey = resolveEvaluatorKey(cfg)
+        const searchable = [displayName, evaluatorKey, cfg?.id, cfg?.slug, cfg?.data?.uri]
             .map((item) => {
                 if (item === undefined || item === null) return undefined
                 const text = String(item).trim()
@@ -61,6 +64,7 @@ const buildPreviewLookup = (previewEvaluators: EvaluatorPreviewDto[]) => {
     const map = new Map<string, EvaluatorPreviewDto>()
     previewEvaluators.forEach((evaluator) => {
         const rawKey =
+            resolveEvaluatorKey(evaluator as any) ||
             (evaluator as any)?.evaluator_key ||
             (evaluator as any)?.flags?.evaluator_key ||
             (evaluator as any)?.meta?.evaluator_key ||
@@ -122,13 +126,14 @@ export const useEvaluatorSelection = ({
 
     const allowedEvaluators = useMemo(() => {
         if (!evaluators?.length) return []
-        return evaluators.filter((config: any) => {
+        return evaluators.filter((config: SimpleEvaluator) => {
             if (!config) return false
+            const evaluatorKey = resolveEvaluatorKey(config)
             const candidates = collectEvaluatorCandidates(
-                config?.evaluator_key,
-                (config as any)?.slug,
+                evaluatorKey,
+                config?.slug,
                 config?.name,
-                config?.key,
+                (config as any)?.key,
                 config?.meta?.evaluator_key,
                 config?.meta?.key,
             )
@@ -141,13 +146,13 @@ export const useEvaluatorSelection = ({
         if (!allowedEvaluators.length) return []
         if (!ENABLE_CORRECT_ANSWER_KEY_FILTER) return allowedEvaluators
         const requiringKey = evaluatorsRequiringCorrectAnswerKey ?? new Set<string>()
-        return allowedEvaluators.filter((config: any) => {
+        return allowedEvaluators.filter((config: SimpleEvaluator) => {
             if (!config) return false
-            const evaluatorKey = config?.evaluator_key
+            const evaluatorKey = resolveEvaluatorKey(config)
             if (evaluatorKey && requiringKey.has(evaluatorKey)) {
                 return false
             }
-            const settingsValues = config?.settings_values || {}
+            const settingsValues = getEvaluatorParameters(config)
             const requiresCorrectAnswerKey = Object.entries(settingsValues).some(([key, value]) => {
                 if (!key) return false
                 const normalizedKey = key.toLowerCase()
@@ -176,7 +181,7 @@ export const useEvaluatorSelection = ({
     const previewLookup = useMemo(() => buildPreviewLookup(previewEvaluators), [previewEvaluators])
 
     const matchedPreviewEvaluator = useMemo(() => {
-        const key = (selectedEvaluatorConfig as any)?.evaluator_key as string | undefined
+        const key = resolveEvaluatorKey(selectedEvaluatorConfig)
         if (!key) return undefined
         return previewLookup.get(key.toLowerCase())
     }, [selectedEvaluatorConfig, previewLookup])
diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts
index fb54e0978b..9cab865352 100644
--- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts
+++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts
@@ -1,8 +1,10 @@
 import {useMemo} from "react"
+import {useMemo} from "react"
 
 import {useAtomValue} from "jotai"
 
 import {evaluatorConfigsAtom} from "@/oss/lib/atoms/evaluation"
+import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils"
 import useEvaluatorConfigs from "@/oss/lib/hooks/useEvaluatorConfigs"
 
 import {EVALUATOR_CATEGORY_LABEL_MAP} from "../constants"
@@ -25,6 +27,7 @@ export const useEvaluatorTypeFromConfigs = ({
         }
 
         const candidates = collectEvaluatorCandidates(
+            resolveEvaluatorKey(evaluator as any),
             (evaluator as any)?.slug,
             (evaluator as any)?.key,
             (evaluator as any)?.meta?.evaluator_key,
@@ -32,7 +35,7 @@ export const useEvaluatorTypeFromConfigs = ({
         )
 
         const match = configs.find((cfg) => {
-            const key = (cfg?.evaluator_key || cfg?.name || cfg?.id || "").toString().trim()
+            const key = (resolveEvaluatorKey(cfg) || cfg?.name || cfg?.id || "").toString().trim()
             if (!key) return false
             const lower = key.toLowerCase()
             if (candidates.includes(lower)) return true
@@ -63,7 +66,7 @@ export const useEvaluatorTypeFromConfigs = ({
         // 2) Infer label by scanning evaluator_key/name tokens for known category slugs
         const categorySlugs = Object.keys(EVALUATOR_CATEGORY_LABEL_MAP || {})
         const keyTokens = [
-            (match as any)?.evaluator_key,
+            resolveEvaluatorKey(match),
             (match as any)?.name,
             (evaluator as any)?.key,
             (evaluator as any)?.name,
diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts
index fddb7511af..fdbd26e16f 100644
--- a/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts
+++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts
@@ -1,3 +1,4 @@
+import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils"
 import type {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types"
 
 import {
@@ -126,6 +127,7 @@ export const extractEvaluatorType = (
     }
 
     const candidates = collectEvaluatorCandidates(
+        resolveEvaluatorKey(evaluator as any),
         (evaluator as any)?.slug,
         (evaluator as any)?.key,
         (evaluator as any)?.name,
@@ -290,8 +292,8 @@ export const extractParameterList = (evaluator?: EvaluatorPreviewDto): Parameter
 
     // Support both simple preview artifacts and workflow evaluators
     const parameterSources = [
-        (evaluator as any)?.settings_values,
         (evaluator as any)?.data?.parameters,
+        (evaluator as any)?.settings_values,
         (evaluator as any)?.data?.service?.configuration?.parameters,
         (evaluator as any)?.data?.configuration?.parameters,
     ]
@@ -359,8 +361,8 @@ export const extractModelName = (evaluator?: EvaluatorPreviewDto) => {
     }
 
     const sources = [
-        (evaluator as any)?.settings_values,
         (evaluator as any)?.data?.parameters,
+        (evaluator as any)?.settings_values,
         (evaluator as any)?.data?.service?.configuration,
         (evaluator as any)?.data?.service?.configuration?.parameters,
         (evaluator as any)?.data?.configuration,
@@ -660,7 +662,8 @@ const normalizeMessageContent = (
 export const extractPromptSections = (evaluator?: EvaluatorPreviewDto): PromptPreviewSection[] => {
     if (!evaluator) return []
     const data = (evaluator as any)?.data ?? {}
-    const settings = (evaluator as any)?.settings_values
+    const parameters = data?.parameters
+    const settings = parameters ?? (evaluator as any)?.settings_values
     const agConfig = data?.parameters?.ag_config ?? data?.parameters?.agConfig
     const messages =
         findFirstMessages(settings) ??
@@ -728,7 +731,6 @@ export const extractPromptSections = (evaluator?: EvaluatorPreviewDto): PromptPr
 
     const promptSources = [
         settings,
-        data?.parameters,
         data?.service?.configuration?.parameters,
         data?.configuration?.parameters,
     ]
diff --git a/web/oss/src/lib/Types.ts b/web/oss/src/lib/Types.ts
index 74f6f31a51..2bd357fc60 100644
--- a/web/oss/src/lib/Types.ts
+++ b/web/oss/src/lib/Types.ts
@@ -870,6 +870,76 @@ export interface Evaluator {
     archived?: boolean
 }
 
+export interface SimpleEvaluatorData { // payload stored under SimpleEvaluator.data; all fields optional
+    version?: string
+    uri?: string // built-in evaluators use "agenta:builtin:<key>:<version>" (see buildEvaluatorUri)
+    url?: string
+    headers?: Record<string, string>
+    schemas?: Record<string, any>
+    script?: {content?: string; runtime?: string}
+    parameters?: Record<string, any> // evaluator settings; read through getEvaluatorParameters
+    service?: Record<string, any>
+    configuration?: Record<string, any>
+}
+
+export interface SimpleEvaluatorFlags { // flags carried on SimpleEvaluator.flags
+    is_custom?: boolean
+    is_evaluator?: boolean // fetchAllEvaluatorConfigs queries with is_evaluator: true
+    is_human?: boolean // ...and with is_human: false
+    requires_llm_api_keys?: boolean
+    evaluator_key?: string // fallback key source consulted by resolveEvaluatorKey
+    color?: string
+}
+
+export interface SimpleEvaluator { // evaluator record exchanged with the /preview/simple/evaluators endpoints
+    id: string
+    slug: string
+    name?: string
+    description?: string
+    tags?: string[]
+    meta?: Record<string, any> // resolveEvaluatorKey falls back to meta.evaluator_key
+    flags?: SimpleEvaluatorFlags
+    data?: SimpleEvaluatorData
+    created_at?: string
+    updated_at?: string
+    deleted_at?: string | null // fetchAllEvaluatorConfigs drops entries where this is set
+    created_by_id?: string
+    updated_by_id?: string
+    deleted_by_id?: string
+    color?: string // filled in client-side by decorateSimpleEvaluator
+    icon_url?: string | StaticImageData // filled in client-side by decorateSimpleEvaluator
+}
+
+export interface SimpleEvaluatorCreate { // sent as {evaluator: ...} to POST /preview/simple/evaluators/
+    slug: string // createEvaluatorConfig generates this with buildEvaluatorSlug
+    name?: string
+    description?: string
+    tags?: string[]
+    meta?: Record<string, any>
+    flags?: SimpleEvaluatorFlags
+    data?: SimpleEvaluatorData
+}
+
+export interface SimpleEvaluatorEdit { // sent as {evaluator: ...} to PUT /preview/simple/evaluators/{id}
+    id: string // updateEvaluatorConfig overwrites this with its configId argument
+    name?: string
+    description?: string
+    tags?: string[]
+    meta?: Record<string, any>
+    flags?: SimpleEvaluatorFlags
+    data?: SimpleEvaluatorData
+}
+
+export interface SimpleEvaluatorResponse { // single-evaluator envelope typed on the create/update calls
+    count: number
+    evaluator: SimpleEvaluator | null // createEvaluatorConfig/updateEvaluatorConfig throw when this is missing
+}
+
+export interface SimpleEvaluatorsResponse { // list envelope typed on POST /preview/simple/evaluators/query
+    count: number
+    evaluators: SimpleEvaluator[]
+}
+
 export interface EvaluatorConfig {
     id: string
     evaluator_key: string
diff --git a/web/oss/src/lib/atoms/evaluation.ts b/web/oss/src/lib/atoms/evaluation.ts
index 323dde41cb..1fbc0039ad 100644
--- a/web/oss/src/lib/atoms/evaluation.ts
+++ b/web/oss/src/lib/atoms/evaluation.ts
@@ -1,6 +1,6 @@
 import {atom} from "jotai"
 
-import {Evaluation, EvaluationScenario, Evaluator, EvaluatorConfig} from "../Types"
+import {Evaluation, EvaluationScenario, Evaluator, SimpleEvaluator} from "../Types"
 
 export const evaluationAtom = atom<Evaluation | undefined>(undefined)
 
@@ -8,4 +8,4 @@ export const evaluationScenariosAtom = atom<EvaluationScenario[]>([])
 
 export const evaluatorsAtom = atom<Evaluator[]>([])
 
-export const evaluatorConfigsAtom = atom<EvaluatorConfig[]>([])
+export const evaluatorConfigsAtom = atom<SimpleEvaluator[]>([])
diff --git a/web/oss/src/lib/evaluators/utils.ts b/web/oss/src/lib/evaluators/utils.ts
new file mode 100644
index 0000000000..e21d98a62e
--- /dev/null
+++ b/web/oss/src/lib/evaluators/utils.ts
@@ -0,0 +1,80 @@
+import type {SimpleEvaluator, SimpleEvaluatorData} from "@/oss/lib/Types"
+
+// Slug-safe form of `value`: trimmed, lower-cased, each run of non [a-z0-9] characters
+// collapsed into a single "-", and leading/trailing dashes removed (null/undefined -> "").
+const normalizeSlugBase = (value?: string | null) => {
+    const lowered = String(value ?? "").trim().toLowerCase()
+    return lowered.replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "")
+}
+
+/** Removes a trailing "-v<digits>" version marker (case-insensitive). */
+const trimVersionSuffix = (value: string) => value.replace(/-v\d+$/i, "")
+
+/**
+ * Pulls the evaluator key out of an evaluator URI, trying in order:
+ * the segment after "agenta:builtin:", the third ":"-separated segment,
+ * then the last "/"-separated segment.
+ * The winner loses any "-v<digits>" suffix; blank or missing input yields undefined.
+ */
+export const extractEvaluatorKeyFromUri = (uri?: string | null): string | undefined => {
+    const text = uri ? uri.trim() : ""
+    if (!text) return undefined
+
+    // Built-in form: agenta:builtin:<key>[:<version>]
+    const builtinKey = /^agenta:builtin:([^:]+)(:|$)/i.exec(text)?.[1]
+    if (builtinKey) return trimVersionSuffix(builtinKey)
+
+    // Any other colon-delimited URI: the key is the third non-empty segment.
+    const thirdColonSegment = text.split(":").filter(Boolean)[2]
+    if (thirdColonSegment) return trimVersionSuffix(thirdColonSegment)
+
+    // Path-like value: fall back to the last non-empty "/" segment.
+    const lastSlashSegment = text.split("/").filter(Boolean).pop()
+    return lastSlashSegment ? trimVersionSuffix(lastSlashSegment) : undefined
+}
+
+/**
+ * Resolves an evaluator's key, preferring the key embedded in `data.uri`, then the
+ * `evaluator_key`, `meta.evaluator_key`, `flags.evaluator_key` and `key` fields.
+ * Blank or non-string values are skipped so they cannot shadow a later valid source;
+ * returns `undefined` (never "") when nothing usable is found.
+ */
+export const resolveEvaluatorKey = (
+    evaluator?: Partial<SimpleEvaluator> | null,
+): string | undefined => {
+    if (!evaluator) return undefined
+    const legacy = evaluator as any // top-level evaluator_key/key are not part of SimpleEvaluator
+    const sources: unknown[] = [
+        extractEvaluatorKeyFromUri(evaluator.data?.uri),
+        legacy.evaluator_key,
+        evaluator.meta?.evaluator_key,
+        evaluator.flags?.evaluator_key,
+        legacy.key,
+    ]
+    return sources.map((s) => (typeof s === "string" ? s.trim() : "")).find(Boolean)
+}
+
+export const buildEvaluatorUri = (evaluatorKey: string, version = "v0") => // built-in evaluator URI
+    `agenta:builtin:${evaluatorKey}:${version}` // version segment defaults to "v0"
+
+/** Slug (<= 50 chars): normalized name (or "evaluator") + "-" + 6 random base36 chars. */
+export const buildEvaluatorSlug = (name?: string | null) => {
+    // Math.random() can stringify to fewer than 6 base36 digits (e.g. 0.5 -> "0.i"): pad it.
+    const suffix = Math.random().toString(36).slice(2, 8).padEnd(6, "0")
+    const base = (normalizeSlugBase(name) || "evaluator").slice(0, 49 - suffix.length)
+    return `${base.replace(/-+$/, "")}-${suffix}` // re-strip dashes exposed by truncation
+}
+
+// Shallow merge of `updates` over `base`: only top-level keys are combined, so a nested
+// object such as `parameters` is replaced wholesale rather than deep-merged.
+// Yields undefined when neither argument is provided.
+export const mergeEvaluatorData = (
+    base?: SimpleEvaluatorData | null,
+    updates?: Partial<SimpleEvaluatorData> | null,
+): SimpleEvaluatorData | undefined => {
+    if (base || updates) return {...base, ...updates}
+    return undefined
+}
+
+export const getEvaluatorParameters = (evaluator?: Partial<SimpleEvaluator> | null) =>
+    (evaluator?.data?.parameters as Record<string, any>) || {} // never null: missing data/parameters -> {}
diff --git a/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts b/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts
index 3765eb6677..998f65459e 100644
--- a/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts
+++ b/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts
@@ -6,11 +6,11 @@ import {SWRConfiguration} from "swr"
 import {useAppId} from "@/oss/hooks/useAppId"
 import {evaluatorConfigsQueryAtomFamily} from "@/oss/state/evaluators"
 
-import {EvaluatorConfig} from "../../Types"
+import {SimpleEvaluator} from "../../Types"
 
 type EvaluatorConfigResult<Preview extends boolean> = Preview extends true
     ? undefined
-    : EvaluatorConfig[]
+    : SimpleEvaluator[]
 
 type EvaluatorConfigsOptions<Preview extends boolean> = {
     preview?: Preview
diff --git a/web/oss/src/services/evaluations/api/index.ts b/web/oss/src/services/evaluations/api/index.ts
index 43bfdb3ca8..9702d501b5 100644
--- a/web/oss/src/services/evaluations/api/index.ts
+++ b/web/oss/src/services/evaluations/api/index.ts
@@ -17,7 +17,7 @@ export {
     createEvaluatorConfig,
     updateEvaluatorConfig,
     deleteEvaluatorConfig,
-    type CreateEvaluationConfigData,
+    type CreateEvaluatorConfigData,
 } from "@/oss/services/evaluators"
 
 //Prefix convention:
diff --git a/web/oss/src/services/evaluators/index.ts b/web/oss/src/services/evaluators/index.ts
index 2a9bb15de7..9a85ae7c8c 100644
--- a/web/oss/src/services/evaluators/index.ts
+++ b/web/oss/src/services/evaluators/index.ts
@@ -1,9 +1,21 @@
 import axios from "@/oss/lib/api/assets/axiosConfig"
+import {
+    buildEvaluatorSlug,
+    buildEvaluatorUri,
+    resolveEvaluatorKey,
+} from "@/oss/lib/evaluators/utils"
 import {getAgentaApiUrl} from "@/oss/lib/helpers/api"
 import {getTagColors} from "@/oss/lib/helpers/colors"
 import {isDemo, stringToNumberInRange} from "@/oss/lib/helpers/utils"
 import {EvaluatorResponseDto} from "@/oss/lib/hooks/useEvaluators/types"
-import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types"
+import {
+    Evaluator,
+    SimpleEvaluator,
+    SimpleEvaluatorCreate,
+    SimpleEvaluatorEdit,
+    SimpleEvaluatorResponse,
+    SimpleEvaluatorsResponse,
+} from "@/oss/lib/Types"
 import aiImg from "@/oss/media/artificial-intelligence.png"
 import bracketCurlyImg from "@/oss/media/bracket-curly.png"
 import codeImg from "@/oss/media/browser.png"
@@ -48,7 +60,7 @@ export const updateEvaluator = async (
     }
 }
 
-export const fetchEvaluatorById = async (evaluatorId: string) => {
+export const fetchEvaluatorById = async (evaluatorId: string): Promise<SimpleEvaluator | null> => {
     const {projectId} = getProjectValues()
     if (!projectId) {
         return null
@@ -59,7 +71,7 @@ export const fetchEvaluatorById = async (evaluatorId: string) => {
     )
     const payload = (response?.data as any)?.evaluator ?? response?.data ?? null
     if (!payload) return null
-    return payload as EvaluatorResponseDto<"response">["evaluator"]
+    return decorateSimpleEvaluator(payload as SimpleEvaluator)
 }
 
 const evaluatorIconsMap = {
@@ -103,58 +115,112 @@ export const fetchAllEvaluators = async (includeArchived = false) => {
 }
 
 // Evaluator Configs
+/** Adds the `icon_url` and tag `color` derived from the evaluator's resolved key, if any. */
+function decorateSimpleEvaluator(evaluator: SimpleEvaluator) {
+    const palette = getTagColors()
+    const key = resolveEvaluatorKey(evaluator)
+    // No resolvable key: hand the evaluator back untouched.
+    if (!key) return evaluator
+
+    const icon_url = evaluatorIconsMap[key as keyof typeof evaluatorIconsMap]
+    const color = palette[stringToNumberInRange(key, 0, palette.length - 1)]
+    return {...evaluator, icon_url, color}
+}
+
 export const fetchAllEvaluatorConfigs = async (
     appId?: string | null,
     projectIdOverride?: string | null,
-) => {
-    const tagColors = getTagColors()
+): Promise<SimpleEvaluator[]> => { // resolves to the decorated evaluators returned by the query endpoint
     const {projectId: projectIdFromStore} = getProjectValues()
     const projectId = projectIdOverride ?? projectIdFromStore
+    void appId // kept in the signature but no longer sent with the request
 
     if (!projectId) {
-        return [] as EvaluatorConfig[]
+        return [] as SimpleEvaluator[] // no project id available: skip the request
     }
 
-    const response = await axios.get("/evaluators/configs", {
-        params: {
-            project_id: projectId,
-            ...(appId ? {app_id: appId} : {}),
+    const response = await axios.post<SimpleEvaluatorsResponse>( // the filter travels in the POST body
+        `${getAgentaApiUrl()}/preview/simple/evaluators/query?project_id=${projectId}`,
+        {
+            evaluator: {
+                flags: {
+                    is_evaluator: true,
+                    is_human: false, // only non-human evaluators are requested
+                },
+            },
+            include_archived: false,
         },
-    })
-    const evaluatorConfigs = (response.data || []).map((item: EvaluatorConfig) => ({
-        ...item,
-        icon_url: evaluatorIconsMap[item.evaluator_key as keyof typeof evaluatorIconsMap],
-        color: tagColors[stringToNumberInRange(item.evaluator_key, 0, tagColors.length - 1)],
-    })) as EvaluatorConfig[]
-    return evaluatorConfigs
+    )
+
+    const evaluators = response.data?.evaluators ?? [] // tolerate a response without an evaluators array
+    return evaluators.filter((item) => !item.deleted_at).map(decorateSimpleEvaluator) // drop rows with deleted_at set
+}
+
+export interface CreateEvaluatorConfigData { // input accepted by createEvaluatorConfig
+    name: string // also seeds the generated slug (buildEvaluatorSlug)
+    evaluator_key: string // turned into data.uri via buildEvaluatorUri
+    parameters: Record<string, any> // sent as data.parameters
+    tags?: string[]
+    description?: string
+}
 
-export type CreateEvaluationConfigData = Omit<EvaluatorConfig, "id" | "created_at">
 export const createEvaluatorConfig = async (
     _appId: string | null | undefined,
-    config: CreateEvaluationConfigData,
-) => {
+    config: CreateEvaluatorConfigData,
+): Promise<SimpleEvaluator> => { // resolves to the created evaluator, decorated with icon_url/color
     const {projectId} = getProjectValues()
     void _appId
 
-    return axios.post(`/evaluators/configs?project_id=${projectId}`, {
-        ...config,
-    })
+    const payload: SimpleEvaluatorCreate = {
+        slug: buildEvaluatorSlug(config.name), // random-suffixed slug derived from the name
+        name: config.name,
+        description: config.description,
+        tags: config.tags,
+        flags: {is_evaluator: true, is_human: false},
+        data: {
+            uri: buildEvaluatorUri(config.evaluator_key), // "agenta:builtin:<evaluator_key>:v0"
+            parameters: config.parameters,
+        },
+    }
+
+    const response = await axios.post<SimpleEvaluatorResponse>(
+        `${getAgentaApiUrl()}/preview/simple/evaluators/?project_id=${projectId}`,
+        {evaluator: payload}, // request body wraps the payload under `evaluator`
+    )
+
+    const evaluator = response.data?.evaluator
+    if (!evaluator) {
+        throw new Error("Failed to create evaluator") // response carried no evaluator
+    }
+
+    return decorateSimpleEvaluator(evaluator)
 }
 
 export const updateEvaluatorConfig = async (
     configId: string,
-    config: Partial<CreateEvaluationConfigData>,
-) => {
+    config: SimpleEvaluatorEdit,
+): Promise<SimpleEvaluator> => { // resolves to the updated evaluator, decorated with icon_url/color
     const {projectId} = getProjectValues()
 
-    return axios.put(`/evaluators/configs/${configId}?project_id=${projectId}`, config)
+    const response = await axios.put<SimpleEvaluatorResponse>(
+        `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}?project_id=${projectId}`,
+        {evaluator: {...config, id: configId}}, // the path id always wins over config.id
+    )
+
+    const evaluator = response.data?.evaluator
+    if (!evaluator) {
+        throw new Error("Failed to update evaluator") // response carried no evaluator
+    }
+
+    return decorateSimpleEvaluator(evaluator)
 }
 
 export const deleteEvaluatorConfig = async (configId: string) => {
     const {projectId} = getProjectValues()
 
-    return axios.delete(`/evaluators/configs/${configId}?project_id=${projectId}`)
+    return axios.post( // "delete" now POSTs to the archive route instead of issuing an HTTP DELETE
+        `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}/archive?project_id=${projectId}`,
+    )
 }
 
 export const deleteHumanEvaluator = async (evaluatorId: string) => {
diff --git a/web/oss/src/state/evaluators/atoms.ts b/web/oss/src/state/evaluators/atoms.ts
index 24f390e884..5c6b34c9e2 100644
--- a/web/oss/src/state/evaluators/atoms.ts
+++ b/web/oss/src/state/evaluators/atoms.ts
@@ -5,6 +5,7 @@ import {atomWithQuery} from "jotai-tanstack-query"
 import {getMetricsFromEvaluator} from "@/oss/components/SharedDrawers/AnnotateDrawer/assets/transforms"
 import axios from "@/oss/lib/api/assets/axiosConfig"
 import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation"
+import {extractEvaluatorKeyFromUri} from "@/oss/lib/evaluators/utils"
 import {transformApiData} from "@/oss/lib/hooks/useAnnotations/assets/transformer"
 import {
     EvaluatorDto,
@@ -13,7 +14,7 @@ import {
     EvaluatorRevisionsResponseDto,
     EvaluatorsResponseDto,
 } from "@/oss/lib/hooks/useEvaluators/types"
-import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types"
+import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types"
 import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/oss/services/evaluators"
 import {selectedAppIdAtom} from "@/oss/state/app"
 import {selectedOrgAtom} from "@/oss/state/org"
@@ -26,16 +27,15 @@ import {EvaluatorConfigsParams, EvaluatorsParams} from "./types"
 
 const extractKeyFromUri = (uri: unknown): string | undefined => {
     if (typeof uri !== "string") return undefined
-    const match = uri.match(/[:/](auto_[a-z0-9_]+)/i)
-    if (match?.[1]) return match[1]
-    const parts = uri.split(":").filter(Boolean)
-    if (parts.length) {
-        const candidate = parts[parts.length - 1]
-        if (candidate) {
-            return candidate.replace(/-v\d+$/i, "")
-        }
-    }
-    return undefined
+    return (
+        extractEvaluatorKeyFromUri(uri) || // shared parser first; the fallbacks below only run if it yields nothing
+        uri.match(/[:/](auto_[a-z0-9_]+)/i)?.[1] || // first "auto_..." token preceded by ":" or "/"
+        uri
+            .split(":")
+            .filter(Boolean)
+            .slice(-1)[0]
+            ?.replace(/-v\d+$/i, "") // last ":" segment minus a trailing "-v<digits>"
+    )
 }
 
 const isPlainObject = (value: unknown): value is Record<string, any> => {
@@ -102,7 +102,7 @@ const extractRequiresLlmApiKeys = (source: unknown): boolean | undefined => {
 
 export const evaluatorConfigsQueryAtomFamily = atomFamily(
     ({projectId: overrideProjectId, appId: overrideAppId, preview}: EvaluatorConfigsParams = {}) =>
-        atomWithQuery<EvaluatorConfig[]>((get) => {
+        atomWithQuery<SimpleEvaluator[]>((get) => {
             const projectId = overrideProjectId || get(projectIdAtom)
             const appId = overrideAppId || get(selectedAppIdAtom)
             const user = get(userAtom) as {id?: string} | null

From 02ad4dcbea427c445023b644cf73b22194763ba6 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 28 Jan 2026 13:00:57 +0100
Subject: [PATCH 4/4] fix(frontend): remove duplicate hook imports

---
 .../evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts    | 1 -
 .../evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx | 1 -
 .../onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts        | 1 -
 3 files changed, 3 deletions(-)

diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts
index 42612e4322..0545163cae 100644
--- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts
+++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts
@@ -1,5 +1,4 @@
 import {useMemo} from "react"
-import {useMemo} from "react"
 
 import type {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types"
 
diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx
index c1cf18acd2..d5e724dcef 100644
--- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx
+++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx
@@ -1,5 +1,4 @@
 import {useMemo} from "react"
-import {useMemo} from "react"
 
 import {SelectProps} from "antd"
 
diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts
index 9cab865352..3cef385d5d 100644
--- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts
+++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts
@@ -1,5 +1,4 @@
 import {useMemo} from "react"
-import {useMemo} from "react"
 
 import {useAtomValue} from "jotai"