diff --git a/docs/design/migrate-evaluator-playground/README.md b/docs/design/migrate-evaluator-playground/README.md new file mode 100644 index 0000000000..b0b9d0c319 --- /dev/null +++ b/docs/design/migrate-evaluator-playground/README.md @@ -0,0 +1,84 @@ +# Migrate Evaluator Playground to New Evaluator Endpoints + +## Overview + +This planning workspace documents the migration of the Evaluator Playground frontend to use the new workflow-based evaluator endpoints. The backend team has migrated evaluators from the old `EvaluatorConfig` model to the new `SimpleEvaluator` (workflow-based) model. + +## Migration Strategy + +**Direct migration (no adapters)** split into two PRs: + +| PR | Scope | Description | +|----|-------|-------------| +| **PR 1** | CRUD | Migrate to `/preview/simple/evaluators/*`, change internal types to `SimpleEvaluator` | +| **PR 2** | Run | Migrate to `/preview/workflows/invoke`, add workflow service types | + +See [plan.md](./plan.md) for detailed implementation steps. 
+ +## Context + +- **PR #3527**: Backend migration that introduces new evaluator endpoints +- **Goal**: Full migration to new endpoints, no legacy code remaining + +## Documents + +| File | Description | +|------|-------------| +| [context.md](./context.md) | Background, motivation, problem statement, goals, and non-goals | +| [current-system.md](./current-system.md) | Detailed map of current Evaluator Playground implementation | +| [new-endpoints.md](./new-endpoints.md) | New evaluator endpoint shapes and differences from legacy | +| [research.md](./research.md) | Deep dive into evaluator execution architecture and URI-based handlers | +| [migration-options.md](./migration-options.md) | Why we chose direct migration over adapters | +| [risk-analysis.md](./risk-analysis.md) | Coupling points and risk areas for the migration | +| [plan.md](./plan.md) | **Main plan** - PR 1 (CRUD) and PR 2 (Run) implementation details | +| [status.md](./status.md) | Living document for progress updates and decisions | + +## Key Mapping Changes + +| Legacy | New | +|--------|-----| +| `EvaluatorConfig` | `SimpleEvaluator` | +| `evaluator_key` | derived from `data.uri` | +| `settings_values` | `data.parameters` | +| `GET /evaluators/configs/` | `POST /preview/simple/evaluators/query` | +| `POST /evaluators/configs/` | `POST /preview/simple/evaluators/` | +| `PUT /evaluators/configs/{id}/` | `PUT /preview/simple/evaluators/{id}` | +| `DELETE /evaluators/configs/{id}/` | `POST /preview/simple/evaluators/{id}/archive` | +| `POST /evaluators/{key}/run/` | `POST /preview/workflows/invoke` | + +## Files Affected + +### PR 1: CRUD Migration + +| Area | Files | +|------|-------| +| Types | `web/oss/src/lib/Types.ts` | +| Services | `web/oss/src/services/evaluators/index.ts` | +| State | `web/oss/src/state/evaluators/atoms.ts` | +| Playground State | `web/oss/src/components/.../ConfigureEvaluator/state/atoms.ts` | +| Playground UI | `web/oss/src/components/.../ConfigureEvaluator/index.tsx` | +| 
Registry | `web/oss/src/components/Evaluators/index.tsx` | +| Registry Hook | `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` | +| Columns | `web/oss/src/components/Evaluators/assets/getColumns.tsx` | + +### PR 2: Run Migration + +| Area | Files | +|------|-------| +| Types | `web/oss/src/lib/Types.ts` (add workflow types) | +| Invoke Service | `web/oss/src/services/workflows/invoke.ts` (new) | +| Debug Section | `web/oss/src/components/.../ConfigureEvaluator/DebugSection.tsx` | + +### Backend Reference (PR #3527) +- `api/oss/src/routers/evaluators_router.py` - Legacy endpoints (kept temporarily) +- `api/oss/src/apis/fastapi/evaluators/router.py` - New `SimpleEvaluators` router +- `api/oss/src/apis/fastapi/workflows/router.py` - Workflow invoke endpoint +- `api/oss/src/core/evaluators/dtos.py` - New data transfer objects + +## Effort Estimate + +| PR | Effort | +|----|--------| +| PR 1: CRUD | 4-5 days | +| PR 2: Run | 3-4 days | +| **Total** | **7-9 days** | diff --git a/docs/design/migrate-evaluator-playground/context.md b/docs/design/migrate-evaluator-playground/context.md new file mode 100644 index 0000000000..5fa82e8b21 --- /dev/null +++ b/docs/design/migrate-evaluator-playground/context.md @@ -0,0 +1,72 @@ +# Context: Migrate Evaluator Playground + +## Background + +The Agenta platform has undergone a significant architectural change where **evaluators are now workflows**. This means evaluators follow the same git-like versioning model as other workflows: +- **Artifact** (Evaluator) → **Variant** → **Revision** + +Previously, evaluators were stored in a flat `EvaluatorConfigDB` table with simple key-value settings. The new model stores evaluators as `WorkflowArtifactDBE`, `WorkflowVariantDBE`, and `WorkflowRevisionDBE` records with richer metadata and versioning. + +## Motivation + +1. **Unified Architecture**: Evaluators, testsets, and apps now share the same git-like workflow model +2. 
**Better Versioning**: Evaluators can have multiple variants and revision history +3. **Richer Metadata**: New model supports URIs, schemas, scripts, and configuration in a structured way +4. **Future Extensibility**: Custom evaluators will be first-class citizens with the same capabilities as built-in ones + +## Problem Statement + +The Evaluator Playground frontend currently uses legacy endpoints: +- `GET /evaluators/` - List evaluator templates +- `GET/POST/PUT/DELETE /evaluators/configs/` - CRUD for evaluator configurations +- `POST /evaluators/{key}/run/` - Run evaluator in playground + +The backend (PR #3527) has: +1. Migrated all evaluator configs to the new workflow-based model via DB migrations +2. Created new `SimpleEvaluators` endpoints at `/preview/simple/evaluators/` +3. Native workflow execution available at `/preview/workflows/invoke` +4. Kept legacy endpoints as thin wrappers (to be deprecated) + +**The frontend needs to migrate to use the new endpoints directly.** + +## Goals + +1. **Replace legacy evaluator config CRUD** with new `SimpleEvaluator` endpoints +2. **Replace legacy evaluator run** with native workflow invoke (`/preview/workflows/invoke`) +3. **Update data models** in frontend to match new `SimpleEvaluator` shape (no adapters) +4. **Preserve UX** - no user-facing changes to the Evaluator Playground functionality +5. **Remove all legacy endpoint usage** - clean migration, no dual-path code + +## Non-Goals + +1. **Not changing the Evaluator Playground UI** - Only the data layer changes +2. **Not migrating evaluation batch runs** - Those already use the new workflow system internally +3. **Not introducing new evaluator features** - This is a pure endpoint migration + +## Success Criteria + +1. Evaluator Playground can create, edit, delete evaluators using new `SimpleEvaluator` endpoints +2. Evaluator Playground can run evaluators using native workflow invoke +3. All existing evaluator configurations continue to work +4. 
No regression in evaluator testing functionality +5. No legacy endpoint calls remain in frontend code + +## Constraints + +1. Must not break existing evaluator configurations +2. Must coordinate with backend team on endpoint availability (PR #3527) +3. Split into two PRs for reviewability (CRUD first, then Run) + +## Migration Approach + +**Direct migration (no adapters):** + +| PR | Scope | Endpoints | +|----|-------|-----------| +| PR 1 | CRUD | `/preview/simple/evaluators/*` | +| PR 2 | Run | `/preview/workflows/invoke` | + +This approach: +- Avoids tech debt from adapter layers +- Aligns internal types with backend models +- Keeps changes reviewable by splitting into two PRs diff --git a/docs/design/migrate-evaluator-playground/current-system.md b/docs/design/migrate-evaluator-playground/current-system.md new file mode 100644 index 0000000000..7797d76ec4 --- /dev/null +++ b/docs/design/migrate-evaluator-playground/current-system.md @@ -0,0 +1,230 @@ +# Current System: Evaluator Playground + +## Overview + +The Evaluator Playground allows users to: +1. **Browse** evaluator templates (built-in evaluators) +2. **Create/Configure** evaluator configurations with custom settings +3. **Test** evaluators by running them against app variants and test cases +4. 
**Manage** (edit, clone, delete) existing evaluator configurations + +## File Structure + +### Entry Points (Pages) + +| Path | Purpose | +|------|---------| +| `/web/oss/src/pages/w/[workspace_id]/p/[project_id]/evaluators/index.tsx` | Evaluators list page | +| `/web/oss/src/pages/w/[workspace_id]/p/[project_id]/evaluators/configure/[evaluator_id].tsx` | Configure evaluator page | + +### Core Components + +#### Evaluators Registry (`/web/oss/src/components/Evaluators/`) + +| File | Purpose | +|------|---------| +| `index.tsx` | Main registry with table, search, tabs (automatic/human) | +| `hooks/useEvaluatorsRegistryData.ts` | Fetches and transforms evaluator data | +| `assets/getColumns.tsx` | Table column definitions | +| `components/SelectEvaluatorModal/` | Modal to select evaluator template for new config | +| `components/ConfigureEvaluator/index.tsx` | Page wrapper that loads data and initializes atoms | +| `components/DeleteEvaluatorsModal/` | Delete confirmation modal | + +#### ConfigureEvaluator (Main UI) + +Location: `/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/` + +| File | Purpose | +|------|---------| +| `index.tsx` | Configuration form + test panel layout | +| `DebugSection.tsx` | Test evaluator panel (run variant, run evaluator) | +| `DynamicFormField.tsx` | Renders settings fields based on evaluator template | +| `AdvancedSettings.tsx` | Collapsible advanced parameters | +| `state/atoms.ts` | Jotai atoms for playground state | +| `variantUtils.ts` | Utility for building variants from revisions | + +### State Management + +#### Playground Atoms (`state/atoms.ts`) + +```typescript +// Session state +playgroundSessionAtom // { evaluator, existingConfigId, mode } +playgroundEvaluatorAtom // Current evaluator template (derived) +playgroundIsEditModeAtom // Is editing existing config? (derived) +playgroundIsCloneModeAtom // Is cloning config? 
(derived) +playgroundEditValuesAtom // Current config values being edited + +// Form state +playgroundFormRefAtom // Ant Design Form instance + +// Test section state +playgroundSelectedVariantAtom // Selected variant for testing +playgroundSelectedTestsetIdAtom // Selected testset ID +playgroundSelectedRevisionIdAtom // Selected revision ID +playgroundSelectedTestcaseAtom // Testcase data +playgroundTraceTreeAtom // Trace output from running variant + +// Persisted state (localStorage) +playgroundLastAppIdAtom // Last used app ID +playgroundLastVariantIdAtom // Last used variant ID + +// Action atoms +initPlaygroundAtom // Initialize playground state +resetPlaygroundAtom // Reset all state +commitPlaygroundAtom // Update state after save +cloneCurrentConfigAtom // Switch to clone mode +``` + +#### Global Evaluator Atoms (`/web/oss/src/state/evaluators/atoms.ts`) + +```typescript +evaluatorConfigsQueryAtomFamily // Query for evaluator configs +evaluatorsQueryAtomFamily // Query for evaluator templates +nonArchivedEvaluatorsAtom // Derived: non-archived evaluators +evaluatorByKeyAtomFamily // Find evaluator by key +``` + +### API Service Layer + +#### Evaluators Service (`/web/oss/src/services/evaluators/index.ts`) + +```typescript +// Evaluator Templates (legacy) +fetchAllEvaluators() // GET /evaluators + +// Evaluator Configs (legacy) +fetchAllEvaluatorConfigs() // GET /evaluators/configs +createEvaluatorConfig() // POST /evaluators/configs +updateEvaluatorConfig() // PUT /evaluators/configs/{id} +deleteEvaluatorConfig() // DELETE /evaluators/configs/{id} + +// Custom/Human Evaluators (new) +createEvaluator() // POST /preview/simple/evaluators/ +updateEvaluator() // PUT /preview/simple/evaluators/{id} +fetchEvaluatorById() // GET /preview/simple/evaluators/{id} +deleteHumanEvaluator() // POST /preview/simple/evaluators/{id}/archive +``` + +#### Evaluator Run Service (`/web/oss/src/services/evaluations/api_ee/index.ts`) + +```typescript 
+createEvaluatorDataMapping() // POST /evaluators/map +createEvaluatorRunExecution() // POST /evaluators/{key}/run +``` + +## Data Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ USER ACTIONS │ +│ - Browse evaluators list │ +│ - Create new evaluator config │ +│ - Edit existing evaluator config │ +│ - Test evaluator with variant + testcase │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ ENTRY POINTS │ +│ /evaluators → EvaluatorsRegistry │ +│ ├─ Uses useEvaluatorsRegistryData() hook │ +│ │ ├─ Calls fetchAllEvaluators() → GET /evaluators │ +│ │ └─ Calls fetchAllEvaluatorConfigs() → GET /evaluators/configs │ +│ │ │ +│ ├─ "Create new" → SelectEvaluatorModal → /evaluators/configure/new │ +│ └─ Click row → /evaluators/configure/{id} │ +│ │ +│ /evaluators/configure/{id} → ConfigureEvaluatorPage │ +│ ├─ Loads evaluator template & existing config │ +│ ├─ Initializes playgroundSessionAtom │ +│ └─ Renders ConfigureEvaluator component │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ ConfigureEvaluator │ +│ ┌─────────────────────────────┐ ┌─────────────────────────────┐ │ +│ │ LEFT: Configuration Form │ │ RIGHT: DebugSection │ │ +│ │ - Name input │ │ - Testcase selector │ │ +│ │ - DynamicFormField[] │ │ - Variant selector │ │ +│ │ - AdvancedSettings │ │ - Run variant button │ │ +│ │ - Commit/Reset buttons │ │ - Run evaluator button │ │ +│ └─────────────────────────────┘ └─────────────────────────────┘ │ +│ │ +│ Commit Actions: │ +│ - Create: POST /evaluators/configs → createEvaluatorConfig() │ +│ - Update: PUT /evaluators/configs/{id} → updateEvaluatorConfig() │ +│ │ +│ Test Actions: │ +│ - Run Variant: callVariant() → POST to variant URL │ +│ - Run Evaluator: 
createEvaluatorRunExecution() │ +│ → POST /evaluators/{key}/run │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +## Current API Endpoints Used + +### Legacy Endpoints (to be migrated) + +| Endpoint | Method | Frontend Function | Purpose | +|----------|--------|-------------------|---------| +| `/evaluators/` | GET | `fetchAllEvaluators()` | List evaluator templates | +| `/evaluators/configs/` | GET | `fetchAllEvaluatorConfigs()` | List evaluator configs | +| `/evaluators/configs/` | POST | `createEvaluatorConfig()` | Create new config | +| `/evaluators/configs/{id}/` | PUT | `updateEvaluatorConfig()` | Update existing config | +| `/evaluators/configs/{id}/` | DELETE | `deleteEvaluatorConfig()` | Delete config | + +### Endpoints That Remain Unchanged (in PR 1) + +| Endpoint | Method | Frontend Function | Purpose | +|----------|--------|-------------------|---------| +| `/evaluators/map/` | POST | `createEvaluatorDataMapping()` | Map trace data for RAG evaluators | +| `/evaluators/{key}/run/` | POST | `createEvaluatorRunExecution()` | Run evaluator (test); unchanged in PR 1, replaced by `POST /preview/workflows/invoke` in PR 2 | + +### Already Using New Endpoints (for custom evaluators) + +| Endpoint | Method | Frontend Function | Purpose | +|----------|--------|-------------------|---------| +| `/preview/simple/evaluators/` | POST | `createEvaluator()` | Create custom evaluator | +| `/preview/simple/evaluators/{id}` | PUT | `updateEvaluator()` | Update custom evaluator | +| `/preview/simple/evaluators/{id}` | GET | `fetchEvaluatorById()` | Fetch evaluator by ID | +| `/preview/simple/evaluators/{id}/archive` | POST | `deleteHumanEvaluator()` | Archive human evaluator | + +## Data Types + +### Current EvaluatorConfig (Legacy) + +```typescript +interface EvaluatorConfig { + id: string + evaluator_key: string + name: string + settings_values: Record + created_at: string + updated_at: string + color?: string + tags?: string[] + // Frontend additions + icon_url?: string | StaticImageData +} +``` + +### Current 
Evaluator Template (Legacy) + +```typescript +interface Evaluator { + name: string + key: string + settings_presets?: SettingsPreset[] + settings_template: Record + icon_url?: string | StaticImageData + color?: string + direct_use?: boolean + description: string + oss?: boolean + requires_llm_api_keys?: boolean + tags: string[] + archived?: boolean +} +``` diff --git a/docs/design/migrate-evaluator-playground/migration-options.md b/docs/design/migrate-evaluator-playground/migration-options.md new file mode 100644 index 0000000000..40bf6b4caa --- /dev/null +++ b/docs/design/migrate-evaluator-playground/migration-options.md @@ -0,0 +1,106 @@ +# Migration Options + +## Goal + +Full migration of the Evaluator Playground to the new workflow-based evaluator APIs, including: +- CRUD on evaluator configs via `/preview/simple/evaluators/*` +- Running evaluators via native workflow invocation (`/preview/workflows/invoke`) instead of the legacy `/evaluators/{key}/run` + +--- + +## Option A (Rejected): Adapter Pattern + +Keep the UI/state assuming the legacy `EvaluatorConfig` shape and translate at the API boundary. 
+ +### Why it was considered + +- Minimizes touching UI/atoms/forms initially +- Lets you swap endpoints quickly with limited regression surface +- Good when backend is still stabilizing schemas + +### Why it was rejected + +- Adds tech debt (adapter layer becomes permanent) +- Delays alignment with new architecture +- Makes future changes harder (two mental models) + +--- + +## Option B (Chosen): Direct Migration + +Change the frontend domain model to match the backend: +- "Evaluator config" becomes `SimpleEvaluator` +- Internal shapes use `data.parameters` instead of `settings_values` +- Internal shapes derive `evaluator_key` from `data.uri` + +### Why it's better + +- No translation debt +- Aligns with "evaluators are workflows" concept end-to-end +- Unlocks revision-aware runs and custom evaluator URIs +- Cleaner codebase long-term + +--- + +## Execution Strategy + +To keep changes reviewable while avoiding adapters: + +### PR 1: CRUD Migration +- Migrate all CRUD operations to `/preview/simple/evaluators/*` +- Change internal types from `EvaluatorConfig` to `SimpleEvaluator` +- Update atoms, services, and components +- Keep legacy run endpoint temporarily + +### PR 2: Run Migration +- Migrate run from `/evaluators/{key}/run` to `/preview/workflows/invoke` +- Add `WorkflowServiceRequest/Response` types +- Update `DebugSection.tsx` to use native invoke + +This sequencing: +1. Isolates CRUD changes for easier review +2. Allows CRUD to stabilize before changing run +3. Avoids adapter layer entirely +4. 
Results in full migration with no legacy code + +--- + +## Files Affected + +### PR 1 (CRUD) + +| Area | Files | +|------|-------| +| Types | `web/oss/src/lib/Types.ts` | +| Services | `web/oss/src/services/evaluators/index.ts` | +| State | `web/oss/src/state/evaluators/atoms.ts` | +| Playground State | `web/oss/src/components/.../ConfigureEvaluator/state/atoms.ts` | +| Playground UI | `web/oss/src/components/.../ConfigureEvaluator/index.tsx` | +| Registry | `web/oss/src/components/Evaluators/index.tsx` | +| Registry Hook | `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` | +| Columns | `web/oss/src/components/Evaluators/assets/getColumns.tsx` | + +### PR 2 (Run) + +| Area | Files | +|------|-------| +| Types | `web/oss/src/lib/Types.ts` (add workflow types) | +| Invoke Service | `web/oss/src/services/workflows/invoke.ts` (new) | +| Debug Section | `web/oss/src/components/.../ConfigureEvaluator/DebugSection.tsx` | + +--- + +## Key Mapping Changes + +| Legacy | New | +|--------|-----| +| `EvaluatorConfig` | `SimpleEvaluator` | +| `evaluator_key` | derived from `data.uri` | +| `settings_values` | `data.parameters` | +| `GET /evaluators/configs/` | `POST /preview/simple/evaluators/query` | +| `POST /evaluators/configs/` | `POST /preview/simple/evaluators/` | +| `PUT /evaluators/configs/{id}/` | `PUT /preview/simple/evaluators/{id}` | +| `DELETE /evaluators/configs/{id}/` | `POST /preview/simple/evaluators/{id}/archive` | +| `POST /evaluators/{key}/run/` | `POST /preview/workflows/invoke` | + +See [plan.md](./plan.md) for detailed implementation steps. 
diff --git a/docs/design/migrate-evaluator-playground/new-endpoints.md b/docs/design/migrate-evaluator-playground/new-endpoints.md new file mode 100644 index 0000000000..97a20f01b2 --- /dev/null +++ b/docs/design/migrate-evaluator-playground/new-endpoints.md @@ -0,0 +1,434 @@ +# New Evaluator Endpoints + +## Overview + +The new evaluator system treats evaluators as **workflows** with git-like versioning. The `SimpleEvaluator` API provides a simplified interface that abstracts the underlying workflow structure. + +## Key Architectural Change + +**Evaluators are now workflows identified by URIs.** + +URI Format: `agenta:builtin:{evaluator_key}:v0` + +Example: `agenta:builtin:auto_exact_match:v0` + +The SDK has a `HANDLER_REGISTRY` that maps URIs to actual handler functions. This enables: +- Native workflow invocation via URI +- Custom evaluators with user-defined URIs (`user:custom:my_evaluator:latest`) +- Version management of evaluator implementations + +## Evaluator Execution Paths + +### Option 1: Legacy Run Endpoint (Maintained for Backward Compatibility) + +``` +POST /evaluators/{evaluator_key}/run/ +``` + +**Request:** +```typescript +interface EvaluatorInputInterface { + inputs: Record // prediction, ground_truth, etc. + settings: Record // evaluator configuration + credentials?: Record +} +``` + +**Response:** +```typescript +interface EvaluatorOutputInterface { + outputs: Record // score, success, etc. 
+} +``` + +**Internal Implementation (PR #3527):** +```python +async def _run_evaluator(evaluator_key: str, evaluator_input): + # Build URI from evaluator_key + uri = f"agenta:builtin:{evaluator_key}:v0" + + # Retrieve handler from SDK registry + handler = retrieve_handler(uri) + + # Invoke handler directly + result = handler(inputs=inputs, outputs=outputs, parameters=settings) + + return {"outputs": result} +``` + +### Option 2: Native Workflow Invoke Endpoint + +``` +POST /preview/workflows/invoke +``` + +**Request:** +```typescript +interface WorkflowServiceRequest { + data: { + inputs: Record + outputs?: any + parameters?: Record // settings + } + revision?: { + data?: { + uri: string // e.g., "agenta:builtin:auto_exact_match:v0" + parameters?: Record + } + } +} +``` + +**Response:** +```typescript +interface WorkflowServiceBatchResponse { + data: { + outputs: Record + } + status?: { + code: number + message: string + } +} +``` + +### Option 3: Evaluator Revision-Based Invoke + +For a fully "native" approach: + +1. **Fetch the evaluator revision:** + ``` + POST /preview/evaluators/revisions/retrieve + ``` + +2. **Get the URI from revision data:** + ```typescript + const uri = evaluatorRevision.data.uri // "agenta:builtin:auto_exact_match:v0" + ``` + +3. **Invoke via workflow service:** + ``` + POST /preview/workflows/invoke + ``` + +## Comparison: Which Approach to Use? 
+ +| Aspect | Legacy Run | Native Invoke | Revision-Based | +|--------|------------|---------------|----------------| +| **Simplicity** | High | Medium | Low | +| **Frontend Changes** | Minimal | Medium | Significant | +| **Architecture Alignment** | Legacy | Native | Most Native | +| **Flexibility** | Low | High | High | +| **Custom Evaluators** | Limited | Full Support | Full Support | +| **Requires URI** | No (uses key) | Yes | Yes (fetched) | + +**Recommendation:** + +For the Evaluator Playground migration: +- **Short-term (PR 1):** Keep using legacy `/evaluators/{key}/run/` while CRUD is migrated - it works the same and the backend handles URI resolution internally +- **Long-term (PR 2):** Migrate to native workflow invoke (`/preview/workflows/invoke`), which also enables custom evaluators and revision-specific execution + +--- + +## New SimpleEvaluator CRUD Endpoints + +Base path: `/preview/simple/evaluators/` + +| Endpoint | Method | Purpose | +|----------|--------|---------| +| `/preview/simple/evaluators/` | POST | Create new evaluator | +| `/preview/simple/evaluators/{id}` | GET | Fetch evaluator by ID | +| `/preview/simple/evaluators/{id}` | PUT | Update evaluator | +| `/preview/simple/evaluators/{id}/archive` | POST | Archive (soft delete) evaluator | +| `/preview/simple/evaluators/{id}/unarchive` | POST | Restore archived evaluator | +| `/preview/simple/evaluators/query` | POST | Query evaluators with filters | + +## Data Structures + +### SimpleEvaluator (Response) + +```python +class SimpleEvaluator: + id: UUID + slug: str + + # Lifecycle + created_at: datetime + updated_at: datetime + + # Header + name: Optional[str] + description: Optional[str] + + # Metadata + tags: Optional[List[str]] + meta: Optional[dict] + + # Flags + flags: Optional[SimpleEvaluatorFlags] + + # Data (revision data) + data: Optional[SimpleEvaluatorData] +``` + +### SimpleEvaluatorData (Revision Configuration) + +```python +class SimpleEvaluatorData: + # Version + version: Optional[str] # e.g., "2025.07.14" + + # Service Interface - THE 
KEY FIELD + uri: Optional[str] # e.g., "agenta:builtin:auto_exact_match:v0" + url: Optional[str] # For webhook evaluators + headers: Optional[Dict[str, Union[Reference, str]]] + + # Schema definitions + schemas: Optional[Dict[str, Schema]] # e.g., {"outputs": {...}} + + # Configuration + script: Optional[dict] # For custom code: {"content": "...", "runtime": "python"} + parameters: Optional[dict] # Settings values (same as legacy settings_values) + + # Legacy fields (for backward compatibility) + service: Optional[dict] + configuration: Optional[dict] +``` + +### URI-based Handler Registry + +The SDK maintains registries that map URIs to implementations: + +```python +HANDLER_REGISTRY = { + "agenta": { + "builtin": { + "echo": {"v0": echo_v0}, + "auto_exact_match": {"v0": auto_exact_match_v0}, + "auto_regex_test": {"v0": auto_regex_test_v0}, + # ... all built-in evaluators + } + }, + "user": { + "custom": { + # User-defined evaluators go here + } + } +} +``` + +Retrieve handler by URI: +```python +handler = retrieve_handler("agenta:builtin:auto_exact_match:v0") +``` + +--- + +## Endpoint Comparison: Old vs New (CRUD) + +### List Evaluator Configs + +**Old:** +``` +GET /evaluators/configs/?project_id={project_id} + +Response: EvaluatorConfig[] +{ + id: string + name: string + evaluator_key: string + settings_values: object + created_at: string + updated_at: string +} +``` + +**New:** +``` +POST /preview/simple/evaluators/query?project_id={project_id} + +Request: SimpleEvaluatorQuery +{ + flags?: { is_evaluator: true } +} + +Response: SimpleEvaluatorsResponse +{ + count: number + evaluators: SimpleEvaluator[] +} +``` + +**Note:** For the Evaluator Registry (automatic configs), pass `flags.is_human = false` and `include_archived = false` so archived or human evaluators don't show up. 
+ +### Create Evaluator Config + +**Old:** +``` +POST /evaluators/configs/?project_id={project_id} + +Request: NewEvaluatorConfig +{ + name: string + evaluator_key: string + settings_values: object +} + +Response: EvaluatorConfig +``` + +**New:** +``` +POST /preview/simple/evaluators/?project_id={project_id} + +Request: SimpleEvaluatorCreateRequest +{ + evaluator: { + slug: string # Generated from name + name: string + flags: { is_evaluator: true, is_human: false } + data: { + uri: "agenta:builtin:{evaluator_key}:v0" + parameters: object # settings_values + schemas: { outputs: object } # Output schema + } + } +} + +Response: SimpleEvaluatorResponse +{ + count: number + evaluator: SimpleEvaluator +} +``` + +**Note:** Workflow slugs are unique per project. We append a short random suffix when generating slugs to avoid collisions when names repeat. + +### Update Evaluator Config + +**Old:** +``` +PUT /evaluators/configs/{id}/?project_id={project_id} + +Request: UpdateEvaluatorConfig +{ + name?: string + settings_values?: object +} + +Response: EvaluatorConfig +``` + +**New:** +``` +PUT /preview/simple/evaluators/{id}?project_id={project_id} + +Request: SimpleEvaluatorEditRequest +{ + evaluator: { + id: UUID + name?: string + data?: { + parameters?: object # settings_values + } + } +} + +Response: SimpleEvaluatorResponse +``` + +**Note:** `SimpleEvaluatorEdit.data` is treated as the full revision payload. When updating, include the existing `data.uri` (and any schemas) along with `data.parameters` to avoid clearing the URI. + +### Delete Evaluator Config + +**Old:** +``` +DELETE /evaluators/configs/{id}/?project_id={project_id} + +Response: boolean +``` + +**New:** +``` +POST /preview/simple/evaluators/{id}/archive?project_id={project_id} + +Response: SimpleEvaluatorResponse +``` + +--- + +## Key Differences Summary + +### 1. 
URI-based Evaluator Identification + +**Old:** `evaluator_key: "auto_exact_match"` + +**New:** `uri: "agenta:builtin:auto_exact_match:v0"` + +The URI enables: +- Version management (`v0`, `v1`, etc.) +- Custom evaluators (`user:custom:my_eval:latest`) +- Handler registry lookup + +### 2. Settings Location + +**Old:** `settings_values: { threshold: 0.5 }` + +**New:** `data.parameters: { threshold: 0.5 }` + +### 3. Output Schema (New) + +The new model includes explicit output schemas: + +```python +data.schemas = { + "outputs": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "score": {"type": "number"}, + "success": {"type": "boolean"} + } + } +} +``` + +### 4. Soft Delete vs Hard Delete + +- **Old:** Hard delete (`DELETE`) +- **New:** Soft delete via archive (`POST .../archive`) + +### 5. Response Wrapper + +**Old:** Returns data directly + +**New:** Returns wrapped response: `{ count: number, evaluator: SimpleEvaluator }` + +--- + +## Frontend Mapping Requirements + +To migrate, the frontend needs to: + +1. **When creating an evaluator:** + - Generate `slug` from name + - Build `uri` from `evaluator_key`: `"agenta:builtin:{evaluator_key}:v0"` + - Move `settings_values` to `data.parameters` + - Set `flags.is_evaluator = true` + - Optionally include `data.schemas.outputs` + +2. **When reading evaluators:** + - Extract `evaluator_key` from `uri` (parse the third segment) + - Read settings from `data.parameters` + - Unwrap response from `{ evaluator: ... }` + +3. **When updating:** + - Include `id` in request body + - Update `data.parameters` for settings changes, and resend the existing `data.uri` (and any `data.schemas`) because `data` is treated as the full revision payload + +4. **When deleting:** + - Use `POST .../archive` instead of `DELETE` + +5. 
**When running evaluators:** + - **Option A (PR 1, interim):** Keep using `/evaluators/{key}/run/` - no change needed while CRUD is migrated + - **Option B (PR 2, target):** Use `/preview/workflows/invoke` with URI from revision diff --git a/docs/design/migrate-evaluator-playground/plan.md b/docs/design/migrate-evaluator-playground/plan.md new file mode 100644 index 0000000000..8a384658f9 --- /dev/null +++ b/docs/design/migrate-evaluator-playground/plan.md @@ -0,0 +1,734 @@ +# Migration Plan: Evaluator Playground + +## Overview + +Full migration of the Evaluator Playground to the new workflow-based evaluator APIs. This plan follows **Plan B (Direct Migration)** - no adapters, internal shapes change to match the new `SimpleEvaluator` model. + +## Migration Strategy + +**Two PRs, no adapters:** + +1. **PR 1:** Migrate CRUD to `SimpleEvaluator` endpoints (internal shapes change) +2. **PR 2:** Migrate run to native workflow invoke (`/preview/workflows/invoke`) + +This keeps changes reviewable while avoiding tech debt from adapter layers. + +``` +PR 1: CRUD Migration +┌─────────────────────────────────────────────────────────────────┐ +│ EvaluatorConfig → SimpleEvaluator │ +│ /evaluators/configs/* → /preview/simple/evaluators/* │ +│ settings_values → data.parameters │ +│ evaluator_key → data.uri │ +└─────────────────────────────────────────────────────────────────┘ + +PR 2: Run Migration +┌─────────────────────────────────────────────────────────────────┐ +│ /evaluators/{key}/run → /preview/workflows/invoke │ +│ EvaluatorInputInterface → WorkflowServiceRequest │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## PR 1: CRUD Migration + +**Goal:** Replace legacy evaluator config endpoints with new SimpleEvaluator endpoints. Change internal data model from `EvaluatorConfig` to `SimpleEvaluator`. 
+ +### Phase 1.1: Type Definitions + +**File:** `web/oss/src/lib/Types.ts` (add to existing types) + +```typescript +// ============ SimpleEvaluator Types ============ + +export interface SimpleEvaluatorData { + version?: string + uri?: string // e.g., "agenta:builtin:auto_exact_match:v0" + url?: string // for webhook evaluators + headers?: Record + schemas?: { + outputs?: Record + inputs?: Record + parameters?: Record + } + script?: { content: string; runtime: string } + parameters?: Record // replaces settings_values +} + +export interface SimpleEvaluatorFlags { + is_custom?: boolean + is_evaluator?: boolean + is_human?: boolean +} + +export interface SimpleEvaluator { + id: string + slug: string + name?: string + description?: string + tags?: string[] + meta?: Record + flags?: SimpleEvaluatorFlags + data?: SimpleEvaluatorData + created_at: string + updated_at: string +} + +export interface SimpleEvaluatorCreate { + slug: string + name?: string + description?: string + tags?: string[] + flags?: SimpleEvaluatorFlags + data?: SimpleEvaluatorData +} + +export interface SimpleEvaluatorEdit { + id: string + name?: string + description?: string + tags?: string[] + data?: SimpleEvaluatorData +} + +export interface SimpleEvaluatorResponse { + count: number + evaluator: SimpleEvaluator | null +} + +export interface SimpleEvaluatorsResponse { + count: number + evaluators: SimpleEvaluator[] +} +``` + +**Deliverables:** +- [ ] Add `SimpleEvaluator*` types to Types.ts +- [ ] Keep `EvaluatorConfig` temporarily for areas not yet migrated + +--- + +### Phase 1.2: Service Layer Changes + +**File:** `web/oss/src/services/evaluators/index.ts` + +Replace legacy functions with new implementations: + +```typescript +// ============ Helper Functions ============ + +/** + * Extract evaluator_key from URI + * URI format: "agenta:builtin:{key}:v0" + */ +export function extractEvaluatorKeyFromUri(uri: string | undefined): string { + if (!uri) return "" + const parts = uri.split(":") + if 
(parts.length >= 3 && parts[0] === "agenta" && parts[1] === "builtin") { + return parts[2] + } + return "" +} + +/** + * Build URI from evaluator key + */ +export function buildEvaluatorUri(evaluatorKey: string): string { + return `agenta:builtin:${evaluatorKey}:v0` +} + +/** + * Generate slug from name (append suffix to avoid collisions) + */ +export function generateSlug(name: string): string { + const base = name + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-|-$/g, "") + + const suffix = Math.random().toString(36).slice(2, 8) + const maxBaseLength = Math.max(1, 50 - suffix.length - 1) + return `${base.slice(0, maxBaseLength)}-${suffix}` +} + +// ============ CRUD Functions ============ + +export const fetchAllEvaluatorConfigs = async ( + _appId?: string | null, // kept for backward compat, ignored + projectIdOverride?: string | null, +): Promise => { + const {projectId: projectIdFromStore} = getProjectValues() + const projectId = projectIdOverride ?? projectIdFromStore + + if (!projectId) return [] + + const response = await axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/query?project_id=${projectId}`, + { + evaluator: { flags: { is_evaluator: true, is_human: false } }, + include_archived: false, + } + ) + + return response.data?.evaluators || [] +} + +export const createEvaluatorConfig = async ( + evaluatorKey: string, + name: string, + settingsValues: Record, +): Promise => { + const {projectId} = getProjectValues() + + const payload: SimpleEvaluatorCreate = { + slug: generateSlug(name), + name, + flags: { is_evaluator: true, is_human: false }, + data: { + uri: buildEvaluatorUri(evaluatorKey), + parameters: settingsValues, + }, + } + + const response = await axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/?project_id=${projectId}`, + { evaluator: payload }, + ) + + const result = response.data?.evaluator + if (!result) throw new Error("Failed to create evaluator") + + return result +} + +export const 
updateEvaluatorConfig = async ( + evaluatorId: string, + updates: { name?: string; settingsValues?: Record }, + existing?: SimpleEvaluator, +): Promise => { + const {projectId} = getProjectValues() + + // IMPORTANT: include existing data (uri/schemas) when editing + const payload: SimpleEvaluatorEdit = { + id: evaluatorId, + name: updates.name ?? existing?.name, + data: { + ...(existing?.data ?? {}), + ...(updates.settingsValues ? {parameters: updates.settingsValues} : {}), + }, + tags: existing?.tags, + meta: existing?.meta, + flags: existing?.flags, + } + + const response = await axios.put( + `${getAgentaApiUrl()}/preview/simple/evaluators/${evaluatorId}?project_id=${projectId}`, + { evaluator: payload }, + ) + + const result = response.data?.evaluator + if (!result) throw new Error("Failed to update evaluator") + + return result +} + +export const deleteEvaluatorConfig = async (evaluatorId: string): Promise => { + const {projectId} = getProjectValues() + + await axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/${evaluatorId}/archive?project_id=${projectId}`, + ) + + return true +} + +export const fetchEvaluatorById = async (evaluatorId: string): Promise => { + const {projectId} = getProjectValues() + + const response = await axios.get( + `${getAgentaApiUrl()}/preview/simple/evaluators/${evaluatorId}?project_id=${projectId}`, + ) + + return response.data?.evaluator || null +} +``` + +**Deliverables:** +- [ ] Replace `fetchAllEvaluatorConfigs` implementation +- [ ] Replace `createEvaluatorConfig` implementation +- [ ] Replace `updateEvaluatorConfig` implementation +- [ ] Replace `deleteEvaluatorConfig` implementation +- [ ] Add helper functions for URI handling +- [ ] Remove legacy endpoint calls + +--- + +### Phase 1.3: State/Atoms Changes + +**File:** `web/oss/src/state/evaluators/atoms.ts` + +Update query atoms to return `SimpleEvaluator[]`: + +```typescript +export const evaluatorConfigsQueryAtomFamily = atomFamily((projectId: string | null) => + 
atomWithQuery(() => ({ + queryKey: ["evaluator-configs", projectId], + queryFn: () => fetchAllEvaluatorConfigs(null, projectId), + enabled: !!projectId, + })) +) + +// Derived atom for non-archived evaluators +export const nonArchivedEvaluatorsAtom = atom((get) => { + const projectId = get(projectIdAtom) + if (!projectId) return [] + + const query = get(evaluatorConfigsQueryAtomFamily(projectId)) + const evaluators = query.data ?? [] + + // Filter out archived (deleted_at is set) + return evaluators.filter((e) => !e.deleted_at) +}) +``` + +**File:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts` + +Update playground atoms to use `SimpleEvaluator`: + +```typescript +// Session now stores SimpleEvaluator instead of EvaluatorConfig +export interface PlaygroundSession { + evaluator: Evaluator // template (unchanged) + simpleEvaluator?: SimpleEvaluator // existing config being edited + mode: "create" | "edit" | "clone" +} + +export const playgroundSessionAtom = atom<PlaygroundSession | null>(null) + +// Edit values now use SimpleEvaluator shape +export const playgroundEditValuesAtom = atom<Partial<SimpleEvaluator> | null>(null) + +// Derived: get evaluator_key from URI +export const playgroundEvaluatorKeyAtom = atom((get) => { + const session = get(playgroundSessionAtom) + if (!session) return null + + // From template + if (session.evaluator?.key) return session.evaluator.key + + // From existing SimpleEvaluator + if (session.simpleEvaluator?.data?.uri) { + return extractEvaluatorKeyFromUri(session.simpleEvaluator.data.uri) + } + + return null +}) +``` + +**Deliverables:** +- [ ] Update `evaluatorConfigsQueryAtomFamily` return type +- [ ] Update playground session atoms +- [ ] Update `playgroundEditValuesAtom` shape +- [ ] Add derived atoms for backward-compatible access (e.g., `evaluator_key`) + +--- + +### Phase 1.4: Component Changes + +#### ConfigureEvaluator/index.tsx + +Key changes: +- Form fields read/write to `data.parameters` instead of 
`settings_values` +- On commit, build `SimpleEvaluatorCreate` or `SimpleEvaluatorEdit` +- Load existing config as `SimpleEvaluator` + +```typescript +// Before +form.setFieldsValue({ + name: editEvalEditValues.name, + settings_values: editEvalEditValues.settings_values, +}) + +// After (use parameters field to match SimpleEvaluator) +form.setFieldsValue({ + name: simpleEvaluator.name, + parameters: simpleEvaluator.data?.parameters, +}) +``` + +#### useEvaluatorsRegistryData.ts + +Update to work with `SimpleEvaluator[]`: + +```typescript +// Derive evaluator_key for display +const enrichedEvaluators = evaluators.map((e) => ({ + ...e, + evaluator_key: extractEvaluatorKeyFromUri(e.data?.uri), + parameters: e.data?.parameters, +})) +``` + +#### getColumns.tsx + +Update column accessors: + +```typescript +// Before +dataIndex: "evaluator_key" + +// After +dataIndex: ["data", "uri"], +render: (uri) => extractEvaluatorKeyFromUri(uri) +``` + +**Deliverables:** +- [ ] Update ConfigureEvaluator form bindings +- [ ] Update commit logic to use new service functions +- [ ] Update useEvaluatorsRegistryData hook +- [ ] Update table columns in getColumns.tsx +- [ ] Update any other components that read evaluator configs + +--- + +### Phase 1.5: Testing + +**Test Cases:** + +1. **List Evaluators** + - [ ] Registry shows all existing evaluator configs + - [ ] Correct names, types, icons displayed + - [ ] Filtering and search work + - [ ] Archived evaluators hidden + +2. **Create Evaluator** + - [ ] Select template → Configure → Commit works + - [ ] Settings (parameters) saved correctly + - [ ] URI generated correctly from evaluator_key + - [ ] Slug generated from name + +3. **Edit Evaluator** + - [ ] Load existing config into form + - [ ] Form populated with current values from `data.parameters` + - [ ] Update name and settings + - [ ] Changes persisted + +4. **Delete Evaluator** + - [ ] Archive endpoint called + - [ ] Evaluator removed from list + - [ ] No errors + +5. 
**Run Evaluator (legacy endpoint - still works)** + - [ ] Run evaluator button works + - [ ] Uses evaluator_key derived from URI + - [ ] Results displayed correctly + +**Deliverables:** +- [ ] Manual test all flows +- [ ] Fix any bugs found +- [ ] Document any edge cases + +--- + +### PR 1 Summary + +| Task | Files | Effort | +|------|-------|--------| +| Type definitions | `Types.ts` | 0.5 day | +| Service layer | `services/evaluators/index.ts` | 1 day | +| State/atoms | `state/evaluators/atoms.ts`, playground atoms | 1 day | +| Components | ConfigureEvaluator, Registry, columns | 1-2 days | +| Testing | Manual testing | 1 day | + +**Total PR 1 Effort:** 4-5 days + +--- + +## PR 2: Run Migration + +**Goal:** Replace legacy `/evaluators/{key}/run` with native workflow invoke `/preview/workflows/invoke`. + +**Prerequisite:** PR 1 merged and stable. + +### Phase 2.1: WorkflowService Types + +**File:** `web/oss/src/lib/Types.ts` (add) + +```typescript +// ============ Workflow Service Types ============ + +export interface WorkflowServiceRequestData { + revision?: Record + parameters?: Record // evaluator settings + testcase?: Record + inputs?: Record // merged testcase data + trace?: Record + outputs?: any // prediction/output +} + +export interface WorkflowServiceInterface { + version?: string + uri?: string // e.g., "agenta:builtin:auto_exact_match:v0" + url?: string + headers?: Record + schemas?: Record +} + +export interface WorkflowServiceConfiguration { + script?: Record + parameters?: Record +} + +export interface WorkflowServiceRequest { + version?: string + flags?: Record + interface?: WorkflowServiceInterface + configuration?: WorkflowServiceConfiguration + data?: WorkflowServiceRequestData + references?: Record + links?: Record +} + +export interface WorkflowServiceStatus { + code?: number + message?: string + type?: string + stacktrace?: string | string[] +} + +export interface WorkflowServiceResponseData { + outputs?: any +} + +export interface 
WorkflowServiceBatchResponse { + version?: string + trace_id?: string + span_id?: string + status?: WorkflowServiceStatus + data?: WorkflowServiceResponseData +} +``` + +--- + +### Phase 2.2: Workflow Invoke Service + +**File:** `web/oss/src/services/workflows/invoke.ts` (new file) + +```typescript +import axios from "@/oss/lib/api/assets/axiosConfig" +import { getAgentaApiUrl } from "@/oss/lib/helpers/utils" +import { getProjectValues } from "@/oss/contexts/project.context" +import { + WorkflowServiceRequest, + WorkflowServiceBatchResponse, + SimpleEvaluator, +} from "@/oss/lib/Types" + +export interface InvokeEvaluatorParams { + evaluator: SimpleEvaluator + inputs: Record // testcase data + any extra inputs + outputs: any // prediction/output from variant + parameters?: Record // override settings (optional) +} + +/** + * Invoke an evaluator using native workflow service + */ +export const invokeEvaluator = async ( + params: InvokeEvaluatorParams +): Promise => { + const { projectId } = getProjectValues() + const { evaluator, inputs, outputs, parameters } = params + + const uri = evaluator.data?.uri + if (!uri) { + throw new Error("Evaluator has no URI configured") + } + + const request: WorkflowServiceRequest = { + version: "2025.07.14", + interface: { + uri, + }, + configuration: { + parameters: parameters ?? evaluator.data?.parameters, + }, + data: { + inputs, + outputs, + parameters: parameters ?? evaluator.data?.parameters, + }, + } + + const response = await axios.post( + `${getAgentaApiUrl()}/preview/workflows/invoke?project_id=${projectId}`, + request, + ) + + return response.data +} + +/** + * Map workflow response to evaluator output format + */ +export function mapWorkflowResponseToEvaluatorOutput( + response: WorkflowServiceBatchResponse +): { outputs: Record } { + if (response.status?.code && response.status.code >= 400) { + throw new Error(response.status.message || "Evaluator execution failed") + } + + return { + outputs: response.data?.outputs ?? 
{}, + } +} +``` + +--- + +### Phase 2.3: Update DebugSection + +**File:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx` + +Replace `createEvaluatorRunExecution` with `invokeEvaluator`: + +```typescript +// Before +const runResponse = await createEvaluatorRunExecution( + selectedEvaluator.key, + { + inputs: outputs, + settings: formValues.parameters, + } +) + +// After +import { invokeEvaluator, mapWorkflowResponseToEvaluatorOutput } from "@/oss/services/workflows/invoke" + +const workflowResponse = await invokeEvaluator({ + evaluator: simpleEvaluator, // from playground state + inputs: { + ...testcaseData, + prediction: variantOutput, + }, + outputs: variantOutput, + parameters: formValues.parameters, // current form settings +}) + +const runResponse = mapWorkflowResponseToEvaluatorOutput(workflowResponse) +``` + +**Error Handling:** + +```typescript +try { + const workflowResponse = await invokeEvaluator(...) + + // Check for workflow-level errors + if (workflowResponse.status?.code && workflowResponse.status.code >= 400) { + message.error(workflowResponse.status.message || "Evaluator failed") + return + } + + const result = mapWorkflowResponseToEvaluatorOutput(workflowResponse) + setEvaluatorResult(result.outputs) + +} catch (error) { + message.error(getErrorMessage(error)) +} +``` + +--- + +### Phase 2.4: Update Evaluations Service (if needed) + +If other parts of the app use `createEvaluatorRunExecution`, update them too: + +**File:** `web/oss/src/services/evaluations/api_ee/index.ts` + +- Keep `createEvaluatorRunExecution` for now (batch evaluations may still use it via backend) +- Or deprecate and point to new invoke + +--- + +### Phase 2.5: Testing + +**Test Cases:** + +1. **Run Evaluator in Playground** + - [ ] Click "Run Evaluator" with testcase loaded + - [ ] Native invoke endpoint called + - [ ] Results displayed correctly + - [ ] Errors handled gracefully + +2. 
**Different Evaluator Types** + - [ ] Test exact_match evaluator + - [ ] Test regex evaluator + - [ ] Test AI critique evaluator (LLM-based) + - [ ] Test custom code evaluator + +3. **Error Scenarios** + - [ ] Invalid evaluator (no URI) + - [ ] Missing inputs + - [ ] Evaluator execution error + - [ ] Network error + +4. **Permissions** + - [ ] User with RUN_WORKFLOWS permission can run + - [ ] User without permission gets appropriate error + +**Deliverables:** +- [ ] Manual test all evaluator types +- [ ] Fix any bugs found +- [ ] Verify error messages are user-friendly + +--- + +### PR 2 Summary + +| Task | Files | Effort | +|------|-------|--------| +| Workflow types | `Types.ts` | 0.5 day | +| Invoke service | `services/workflows/invoke.ts` | 0.5 day | +| DebugSection update | `DebugSection.tsx` | 1 day | +| Error handling | Various | 0.5 day | +| Testing | Manual testing | 1 day | + +**Total PR 2 Effort:** 3-4 days + +--- + +## Timeline Summary + +| PR | Tasks | Effort | Dependencies | +|----|-------|--------|--------------| +| PR 1: CRUD Migration | Types, services, atoms, components | 4-5 days | Backend PR #3527 merged | +| PR 2: Run Migration | Workflow types, invoke service, DebugSection | 3-4 days | PR 1 merged and stable | + +**Total Implementation:** 7-9 days + +--- + +## Rollback Plan + +### PR 1 Rollback +- Revert PR 1 commit +- Legacy endpoints still exist on backend for a period + +### PR 2 Rollback +- Revert PR 2 commit +- Fall back to legacy `/evaluators/{key}/run` (still supported) + +--- + +## Open Questions + +1. **Slug uniqueness:** Backend enforces unique slugs per project; generate a short suffix client-side to avoid collisions. + +2. **Output schemas:** Should frontend pass `data.schemas.outputs` when creating? Or does backend derive from evaluator type? + +3. **Permission model:** Is `RUN_WORKFLOWS` the right permission for evaluator playground? Or should there be `RUN_EVALUATORS`? + +4. 
**Trace linking:** Should the playground display trace_id from workflow response for debugging? diff --git a/docs/design/migrate-evaluator-playground/research.md b/docs/design/migrate-evaluator-playground/research.md new file mode 100644 index 0000000000..eda511d37b --- /dev/null +++ b/docs/design/migrate-evaluator-playground/research.md @@ -0,0 +1,211 @@ +# Research Notes: Evaluator Execution Architecture + +## Findings from PR #3527 Investigation + +### Discovery: Native Evaluator Execution Path + +The new architecture treats evaluators as workflows with URI-based identification. The key discovery is that even the legacy `/evaluators/{key}/run/` endpoint now uses the native handler registry internally. + +### Handler Registry Architecture + +The SDK maintains a global registry of workflow handlers: + +**Location:** `sdk/agenta/sdk/workflows/utils.py` + +```python +HANDLER_REGISTRY = { + "agenta": { + "builtin": { + "echo": {"v0": echo_v0}, + "auto_exact_match": {"v0": auto_exact_match_v0}, + "auto_regex_test": {"v0": auto_regex_test_v0}, + "field_match_test": {"v0": field_match_test_v0}, + "json_multi_field_match": {"v0": json_multi_field_match_v0}, + "auto_webhook_test": {"v0": auto_webhook_test_v0}, + "auto_custom_code_run": {"v0": auto_custom_code_run_v0}, + "auto_ai_critique": {"v0": auto_ai_critique_v0}, + # ... 
more evaluators + } + }, + "user": { + "custom": { + # Custom user evaluators + } + } +} +``` + +**URI Format:** `provider:kind:key:version` + +Examples: +- `agenta:builtin:auto_exact_match:v0` +- `user:custom:my_custom_eval:latest` + +**URI Parsing:** +```python +def parse_uri(uri: str) -> Tuple[provider, kind, key, version]: + # "agenta:builtin:echo:v0" → ("agenta", "builtin", "echo", "v0") +``` + +### How the Legacy Run Endpoint Works Now (PR #3527) + +**File:** `api/oss/src/routers/evaluators_router.py` + +The PR changed the implementation to use the native handler registry: + +```python +@router.post("/{evaluator_key}/run/", response_model=EvaluatorOutputInterface) +async def evaluator_run(request: Request, evaluator_key: str, payload: EvaluatorInputInterface): + # ... auth setup ... + result = await _run_evaluator(evaluator_key, payload) + return result + +async def _run_evaluator(evaluator_key: str, evaluator_input: EvaluatorInputInterface): + # Build URI from evaluator_key + uri = f"agenta:builtin:{evaluator_key}:v0" + + # Retrieve the handler from SDK registry + handler = retrieve_handler(uri) + if handler is None: + raise NotImplementedError(f"Evaluator {evaluator_key} not found (uri={uri})") + + # Extract data from evaluator_input + inputs = evaluator_input.inputs or {} + settings = evaluator_input.settings or {} + outputs = inputs.get("prediction", inputs.get("output")) + + # Build kwargs based on handler signature + sig = inspect.signature(handler) + kwargs = {} + if "parameters" in sig.parameters: + kwargs["parameters"] = settings + if "inputs" in sig.parameters: + kwargs["inputs"] = inputs + if "outputs" in sig.parameters: + kwargs["outputs"] = outputs + + # Invoke the handler + result = handler(**kwargs) + if inspect.iscoroutine(result): + result = await result + + return {"outputs": result} +``` + +**Key Insight:** The legacy endpoint is now a thin wrapper that: +1. Builds the URI from the evaluator_key +2. Looks up the handler in the registry +3. 
Invokes it directly + +### Native Workflow Invoke Path + +For fully native execution, there's also a generic workflow invoke endpoint: + +**Endpoint:** `POST /preview/workflows/invoke` + +**Request Structure:** +```python +class WorkflowServiceRequest: + data: WorkflowServiceRequestData # inputs, outputs, parameters + revision: Optional[dict] # contains URI in data.uri +``` + +**How Batch Evaluations Use It:** + +**File:** `api/oss/src/core/evaluations/tasks/legacy.py` (lines 1185-1228) + +```python +workflow_service_request_data = WorkflowServiceRequestData( + inputs=inputs, + outputs=outputs, + # + parameters=evaluator_reference.get("configuration"), # settings +) + +workflow_service_request = WorkflowServiceRequest( + data=workflow_service_request_data, + # + environment=environment, + revision=evaluator_reference.get("revision"), # contains URI +) + +await workflows_service.invoke_workflow( + project_id=project_id, + user_id=user_id, + request=workflow_service_request, +) +``` + +### Implications for Frontend Migration + +#### For Evaluator CRUD (Create/Read/Update/Delete) + +**Must migrate to new endpoints** because: +- Legacy endpoints now call SimpleEvaluator endpoints internally +- Data is stored in new workflow-based format +- Frontend should use native API to avoid translation overhead + +#### For Evaluator Run (Testing in Playground) + +**Options:** + +1. **Keep using `/evaluators/{key}/run/`** (Recommended for now) + - Simplest approach + - Endpoint still works + - Internally uses native path + - No frontend changes needed + +2. **Use native workflow invoke** + - Requires building `WorkflowServiceRequest` + - Need to include evaluator revision with URI + - More complex but more "correct" + - Enables custom evaluator support + +3. 
**Hybrid approach** + - Use legacy endpoint for built-in evaluators + - Use native invoke for custom evaluators (which will have custom URIs) + +### Questions Resolved + +**Q: Why does the legacy run endpoint remain unchanged?** + +A: It's not unchanged internally - PR #3527 refactored it to use the native handler registry. But the external interface (URL, request/response format) is preserved for backward compatibility. + +**Q: Is there a "native" way to run evaluators?** + +A: Yes, via the workflow invoke endpoint with `WorkflowServiceRequest` containing the evaluator's URI. But for the playground, the legacy endpoint is simpler and equivalent. + +**Q: Should we migrate the run endpoint usage?** + +A: Not necessarily. The benefits of migrating would be: +- Consistency with new architecture +- Support for custom evaluators with custom URIs +- Ability to run specific evaluator revisions + +But the costs are: +- More complex payload construction +- Need to fetch evaluator revision to get URI +- No immediate user-facing benefit + +**Recommendation:** Keep using legacy run endpoint for now, plan native invoke for custom evaluator feature. + +## Note on "Qdrant changes" + +Within this repository, Qdrant appears in examples and cookbook/tutorial code (e.g., `examples/python/*`, `docs/docs/tutorials/*`), but not in the core evaluator/workflow execution path under `api/oss/src`. + +Implication for this migration: +- Migrating the evaluator playground to `/preview/workflows/invoke` does not require any Qdrant-specific frontend changes. +- Any Qdrant-related behavior is part of the *application/workflow being evaluated* (e.g., a RAG app calling Qdrant), and would surface only through normal workflow invocation inputs/outputs/traces. 
+ +--- + +## Related Files Analyzed + +- `api/oss/src/routers/evaluators_router.py` - Legacy endpoints (now with native internals) +- `api/oss/src/apis/fastapi/evaluators/router.py` - New SimpleEvaluators router +- `api/oss/src/apis/fastapi/workflows/router.py` - Workflow invoke endpoint +- `api/oss/src/core/workflows/service.py` - Workflow invocation service +- `api/oss/src/core/evaluations/tasks/legacy.py` - Batch evaluation using native invoke +- `sdk/agenta/sdk/workflows/utils.py` - Handler registry and URI parsing +- `sdk/agenta/sdk/workflows/interfaces.py` - Evaluator interfaces (schemas) +- `sdk/agenta/sdk/workflows/handlers.py` - Actual evaluator implementations diff --git a/docs/design/migrate-evaluator-playground/risk-analysis.md b/docs/design/migrate-evaluator-playground/risk-analysis.md new file mode 100644 index 0000000000..3c522d441a --- /dev/null +++ b/docs/design/migrate-evaluator-playground/risk-analysis.md @@ -0,0 +1,320 @@ +# Risk Analysis: Evaluator Playground Migration + +## Coupling Points + +### 1. State Management Coupling + +**Location:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts` + +**Risk Level:** MEDIUM + +The playground state is tightly coupled to the `EvaluatorConfig` shape: + +```typescript +// playgroundEditValuesAtom expects EvaluatorConfig shape +interface EvaluatorConfig { + id: string + evaluator_key: string + name: string + settings_values: Record +} +``` + +**Impact:** +- `commitPlaygroundAtom` expects `EvaluatorConfig` as input +- `playgroundEditValuesAtom` is read throughout ConfigureEvaluator and DebugSection +- Form initialization relies on `settings_values` property name + +**Mitigation (PR 1):** +- Update atoms to use `SimpleEvaluator` shape directly +- Add derived atoms for backward-compatible access (e.g., `evaluator_key` from URI) +- Update all atom consumers in ConfigureEvaluator and DebugSection + +--- + +### 2. 
Form Initialization Coupling + +**Location:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx` + +**Risk Level:** MEDIUM + +Form initialization directly accesses `settings_values`: + +```typescript +// Line 383-410 +if (editMode && editEvalEditValues) { + form.setFieldsValue({ + ...editEvalEditValues, + settings_values: editEvalEditValues.settings_values || {}, + }) +} +``` + +**Impact:** +- Changing to `data.parameters` would break form binding +- DynamicFormField components use `["settings_values", field.key]` name paths + +**Mitigation (PR 1):** +- Update form field names from `settings_values` to `parameters` +- Update DynamicFormField name paths +- Update form.getFieldsValue() to extract `parameters` + +--- + +### 3. Service Layer Coupling + +**Location:** `web/oss/src/services/evaluators/index.ts` + +**Risk Level:** LOW-MEDIUM + +API calls directly construct legacy payload shapes: + +```typescript +// createEvaluatorConfig +return axios.post(`/evaluators/configs?project_id=${projectId}`, { + ...config, +}) + +// updateEvaluatorConfig +return axios.put(`/evaluators/configs/${configId}?project_id=${projectId}`, config) +``` + +**Impact:** +- Need to update URLs and payload transformation +- Response handling needs to unwrap `{ evaluator: ... }` wrapper + +**Mitigation (PR 1):** +- Replace all service functions with new implementations +- New functions build `SimpleEvaluator` payloads directly +- Handle response wrapper in service layer + +--- + +### 4. 
Evaluators Registry Coupling + +**Location:** `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` + +**Risk Level:** MEDIUM + +The hook transforms and combines data from two sources: + +```typescript +const {evaluatorConfigs} = useFetchEvaluatorsData() +// Combines with evaluator templates for display +``` + +**Impact:** +- Table columns expect `evaluator_key` property +- Tag cells, type pills depend on config shape +- Filtering/search operates on legacy property names + +**Mitigation (PR 1):** +- Update hook to work with `SimpleEvaluator[]` +- Derive `evaluator_key` from `data.uri` for display +- Update column accessors in getColumns.tsx + +--- + +### 5. Debug Section - Evaluator Run Coupling + +**Location:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx` + +**Risk Level:** MEDIUM (PR 2) + +The evaluator run uses legacy endpoint: + +```typescript +const runResponse = await createEvaluatorRunExecution( + selectedEvaluator.key, // evaluator_key + { inputs: outputs, settings: ... } +) +``` + +**Impact:** +- Must migrate to `/preview/workflows/invoke` +- Need to construct `WorkflowServiceRequest` +- Different error handling (workflow status vs HTTP errors) + +**Mitigation (PR 2):** +- Create new `invokeEvaluator()` service function +- Build `WorkflowServiceRequest` with URI from `SimpleEvaluator.data.uri` +- Map workflow response/errors to UI + +--- + +### 6. 
Global Atoms Coupling + +**Location:** `web/oss/src/state/evaluators/atoms.ts` + +**Risk Level:** MEDIUM + +Query atoms return legacy-shaped data: + +```typescript +const evaluatorConfigsQueryAtomFamily = atomFamily((projectId) => + atomWithQuery(() => ({ + queryKey: ['evaluator-configs', projectId], + queryFn: () => fetchAllEvaluatorConfigs(null, projectId), + })) +) +``` + +**Impact:** +- Multiple components may depend on these atoms +- Changing shape could cascade through application + +**Mitigation (PR 1):** +- Update service function to return `SimpleEvaluator[]` +- Update all consumers to handle new shape +- Change in one place (service), ripple through atoms automatically + +--- + +### 7. Evaluator Templates vs Configs Distinction + +**Location:** Throughout frontend + +**Risk Level:** LOW + +The frontend distinguishes between: +- **Evaluator templates** (`Evaluator`): Built-in evaluator definitions with `settings_template` +- **Evaluator configs** (`SimpleEvaluator`): User-created configurations with `data.parameters` + +**Impact:** +- This distinction is maintained in the new system +- Templates come from `/evaluators/` (unchanged) +- Configs become `SimpleEvaluator` objects + +**Mitigation:** +- No conceptual change needed +- Templates API unchanged +- Just update config handling + +--- + +## Risk Summary Table + +| Component | Risk Level | PR | Priority | +|-----------|-----------|-----|----------| +| Service Layer | LOW-MEDIUM | PR 1 | HIGH (change first) | +| State Atoms | MEDIUM | PR 1 | HIGH | +| ConfigureEvaluator Form | MEDIUM | PR 1 | MEDIUM | +| Evaluators Registry | MEDIUM | PR 1 | MEDIUM | +| Global Query Atoms | MEDIUM | PR 1 | MEDIUM | +| Debug Section (Run) | MEDIUM | PR 2 | MEDIUM | + +## Concrete Breakage Scenarios + +### Scenario 1: Form Submission Fails + +**Trigger:** Form still uses `settings_values` but service expects `parameters` + +**Symptoms:** +- Form submits but settings are lost +- Backend receives empty configuration +- 
Evaluator created but doesn't work + +**Prevention:** +- Update form field names to `parameters` +- Test form submission with real backend +- Verify payload in network tab + +--- + +### Scenario 2: Evaluator List Empty + +**Trigger:** Query endpoint returns `SimpleEvaluator[]`, UI expects `EvaluatorConfig[]` + +**Symptoms:** +- Evaluators registry shows empty list +- No error messages (data exists but unparseable) +- Console shows undefined property access + +**Prevention:** +- Update all components to use `SimpleEvaluator` shape +- Add null checks for `data?.uri`, `data?.parameters` +- Log transformation errors + +--- + +### Scenario 3: Edit Mode Fails to Load + +**Trigger:** Component expects `settings_values`, receives `data.parameters` + +**Symptoms:** +- Navigate to edit page, form is empty +- Settings not populated +- Save overwrites with empty config + +**Prevention:** +- Update form initialization to read from `data.parameters` +- Test edit flow with existing configs + +--- + +### Scenario 4: Delete Fails Silently + +**Trigger:** `DELETE` endpoint no longer exists, `POST .../archive` required + +**Symptoms:** +- Click delete, no error +- Evaluator still appears +- Network tab shows 404/405 + +**Prevention:** +- Update delete function to use archive endpoint +- Verify response handling + +--- + +### Scenario 5: Evaluator Run Fails (PR 2) + +**Trigger:** Workflow invoke returns different response shape + +**Symptoms:** +- Run button shows error +- Results not displayed +- Console shows parsing errors + +**Prevention:** +- Map `WorkflowServiceBatchResponse` to expected output format +- Handle `status.code` errors from workflow response +- Test with all evaluator types + +--- + +## Recommended Testing Strategy + +### PR 1 Testing + +**Unit Tests:** +- [ ] URI parsing (`agenta:builtin:key:v0` → `key`) +- [ ] Slug generation from name +- [ ] Service function request/response handling + +**Integration Tests:** +- [ ] Create evaluator config flow +- [ ] Edit 
evaluator config flow +- [ ] Delete (archive) evaluator config flow +- [ ] List/query evaluator configs flow + +**E2E Tests:** +- [ ] Full playground flow: select template → configure → test → commit +- [ ] Edit existing evaluator configuration +- [ ] Clone evaluator configuration +- [ ] Delete evaluator configuration + +### PR 2 Testing + +**Unit Tests:** +- [ ] `WorkflowServiceRequest` construction +- [ ] Response mapping to evaluator output format +- [ ] Error status handling + +**Integration Tests:** +- [ ] Run evaluator with different types (exact_match, regex, AI critique) +- [ ] Error scenarios (invalid inputs, missing outputs) + +**Regression Tests:** +- [ ] Existing configs load correctly +- [ ] Batch evaluations still work (they use backend workflow invoke) diff --git a/docs/design/migrate-evaluator-playground/status.md b/docs/design/migrate-evaluator-playground/status.md new file mode 100644 index 0000000000..dbce737e8f --- /dev/null +++ b/docs/design/migrate-evaluator-playground/status.md @@ -0,0 +1,136 @@ +# Status: Evaluator Playground Migration + +## Current Phase: PR 1 (CRUD) In Progress + +**Last Updated:** 2026-01-27 + +--- + +## Chosen Approach + +**Direct Migration (No Adapters)** - Split into two PRs: + +1. **PR 1:** CRUD migration to `SimpleEvaluator` endpoints +2. **PR 2:** Run migration to native workflow invoke + +See [plan.md](./plan.md) for detailed implementation steps. 
+ +--- + +## Progress Summary + +### Completed + +- [x] Map current Evaluator Playground implementation + - Identified all frontend components + - Documented state management (atoms) + - Mapped API endpoints used + - Documented data flow + +- [x] Analyze PR #3527 (backend migration) + - Understood new `SimpleEvaluator` data model + - Documented new endpoint shapes + - Identified backward compatibility layer + +- [x] Investigate native evaluator execution path + - Confirmed `/evaluators/{key}/run` now resolves `agenta:builtin:{key}:v0` via SDK handler registry + - Confirmed native workflow execution endpoint exists: `POST /preview/workflows/invoke` + - Documented request structure used by batch evaluation tasks + +- [x] Compare old vs new endpoints + - Documented request/response differences + - Identified URI-based evaluator identification + - Noted response wrapper changes + +- [x] Identify coupling and risk areas + - State management coupling (MEDIUM risk) + - Form initialization coupling (MEDIUM risk) + - Service layer coupling (LOW-MEDIUM risk) + - Created risk mitigation strategies + +- [x] Finalize migration plan + - Chose direct migration (no adapters) + - Split into PR 1 (CRUD) and PR 2 (Run) + - Documented all file changes needed + +### Next Steps + +- [ ] Complete PR 1: CRUD migration (stacked on PR #3527) +- [ ] After PR 1 stable, start PR 2: Run migration + +--- + +## Key Decisions + +| Decision | Rationale | Date | +|----------|-----------|------| +| Direct migration (no adapters) | Avoids tech debt, aligns with new architecture | 2026-01-27 | +| Two-PR approach | Keeps changes reviewable, allows CRUD to stabilize first | 2026-01-27 | +| Internal shapes become `SimpleEvaluator` | Matches backend model, no translation layer | 2026-01-27 | + +--- + +## Key Findings + +### 1. 
The `/evaluators/{key}/run/` endpoint is a thin wrapper + +PR #3527 refactored the legacy run endpoint to use the native handler registry internally: +- It builds a URI from the `evaluator_key`: `agenta:builtin:{key}:v0` +- Uses `retrieve_handler(uri)` to get the actual handler function +- Directly invokes the handler + +### 2. Native workflow invoke path exists + +There's a fully native way to run evaluators: +- Endpoint: `POST /preview/workflows/invoke` +- Uses `WorkflowServiceRequest` with the URI in its interface +- Same mechanism used by batch evaluations + +### 3. URI-based handler registry + +The SDK maintains a `HANDLER_REGISTRY` that maps URIs to handler functions: +- Format: `agenta:builtin:{evaluator_key}:v0` +- Supports custom evaluators: `user:custom:my_eval:latest` +- Enables version management of evaluator implementations + +### 4. Key mapping changes + +| Legacy | New | +|--------|-----| +| `evaluator_key` | derived from `data.uri` | +| `settings_values` | `data.parameters` | +| `EvaluatorConfig` | `SimpleEvaluator` | + +--- + +## Open Questions + +1. **Slug uniqueness:** Backend enforces unique slugs per project; generate a short suffix client-side to avoid collisions. + +2. **Output schemas:** Should frontend pass `data.schemas.outputs` when creating? Or does backend derive from evaluator type? + +3. **Permission model:** Is `RUN_WORKFLOWS` the right permission for evaluator playground? Or should there be `RUN_EVALUATORS`?
+ +--- + +## Effort Estimates + +| PR | Effort | Dependencies | +|----|--------|--------------| +| PR 1: CRUD Migration | 4-5 days | Backend PR #3527 merged | +| PR 2: Run Migration | 3-4 days | PR 1 merged and stable | + +**Total:** 7-9 days implementation + +--- + +## Related Links + +- [PR #3527: Migrate evaluators but keep legacy endpoints](https://github.com/Agenta-AI/agenta/pull/3527) +- [context.md](./context.md) - Background and goals +- [current-system.md](./current-system.md) - Current implementation details +- [new-endpoints.md](./new-endpoints.md) - New endpoint documentation +- [research.md](./research.md) - Handler registry and execution research +- [migration-options.md](./migration-options.md) - Why we chose direct migration +- [risk-analysis.md](./risk-analysis.md) - Coupling and risk analysis +- [plan.md](./plan.md) - Detailed implementation plan diff --git a/web/oss/src/components/Evaluators/assets/types.ts b/web/oss/src/components/Evaluators/assets/types.ts index f928cdc801..ccfdfaaa06 100644 --- a/web/oss/src/components/Evaluators/assets/types.ts +++ b/web/oss/src/components/Evaluators/assets/types.ts @@ -1,5 +1,5 @@ import {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" export type EvaluatorCategory = "automatic" | "human" @@ -15,7 +15,7 @@ export type EvaluatorPreview = EvaluatorPreviewDto & { metrics?: Record } -export type EvaluatorConfigRow = EvaluatorConfig & { +export type EvaluatorConfigRow = SimpleEvaluator & { evaluator?: Evaluator | null kind?: "config" } diff --git a/web/oss/src/components/Evaluators/assets/utils.ts b/web/oss/src/components/Evaluators/assets/utils.ts index 4b09fa2d46..a750ce248f 100644 --- a/web/oss/src/components/Evaluators/assets/utils.ts +++ b/web/oss/src/components/Evaluators/assets/utils.ts @@ -1,6 +1,7 @@ +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import 
{formatDay} from "@/oss/lib/helpers/dateTimeHelper" import {capitalize} from "@/oss/lib/helpers/utils" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import { EvaluatorCategory, @@ -54,7 +55,7 @@ const formatDate = (value?: string) => { return formatDay({date: value}) } -const collectConfigTags = (config: EvaluatorConfig, evaluator?: Evaluator | null) => { +const collectConfigTags = (config: SimpleEvaluator, evaluator?: Evaluator | null) => { const tags = new Set() if (Array.isArray(config.tags)) { @@ -132,11 +133,12 @@ export const transformEvaluatorsToRows = ( } const buildConfigTypeBadge = ( - config: EvaluatorConfig, + config: SimpleEvaluator, category: Extract, evaluator?: Evaluator | null, ): EvaluatorTypeBadge => { - const label = evaluator?.name || createTypeLabel(config.evaluator_key, config.name) + const evaluatorKey = resolveEvaluatorKey(config) + const label = evaluator?.name || createTypeLabel(evaluatorKey, config.name) const colorHex = config.color || evaluator?.color return { @@ -146,44 +148,54 @@ const buildConfigTypeBadge = ( } } -const extractConfigVersion = (config: EvaluatorConfig) => { - const serviceValues = (config.settings_values as any)?.service || {} +const extractConfigVersion = (config: SimpleEvaluator) => { + const parameters = (config.data as any)?.parameters || {} + const serviceValues = (config.data as any)?.service || {} + const serviceConfig = serviceValues?.configuration || {} const candidate = (config as any)?.version || serviceValues?.agenta || serviceValues?.version || - (config.settings_values as any)?.version || + serviceConfig?.version || + serviceConfig?.agenta || + parameters?.version || "" return sanitizeVersion(typeof candidate === "string" ? 
candidate : "") } -const extractConfigModifiedBy = (config: EvaluatorConfig) => { +const extractConfigModifiedBy = (config: SimpleEvaluator) => { const modifiedBy = (config as any)?.updated_by || (config as any)?.updatedBy || + (config as any)?.updated_by_id || + (config as any)?.updatedById || (config as any)?.created_by || (config as any)?.createdBy || + (config as any)?.created_by_id || + (config as any)?.createdById || "" return typeof modifiedBy === "string" ? modifiedBy : "" } export const transformEvaluatorConfigsToRows = ( - configs: EvaluatorConfig[], + configs: SimpleEvaluator[], category: Extract, evaluators: Evaluator[], ): EvaluatorRegistryRow[] => { const evaluatorsMap = new Map(evaluators.map((item) => [item.key, item])) return configs.map((config) => { - const evaluator = evaluatorsMap.get(config.evaluator_key) || null + const evaluatorKey = resolveEvaluatorKey(config) + const evaluator = evaluatorKey ? evaluatorsMap.get(evaluatorKey) || null : null const badge = buildConfigTypeBadge(config, category, evaluator) const versionLabel = extractConfigVersion(config) const tags = collectConfigTags(config, evaluator) const modifiedBy = extractConfigModifiedBy(config) const createdAt = config.created_at const updatedAt = config.updated_at || createdAt + const displayName = config.name || evaluator?.name || evaluatorKey || config.slug || "" const raw: EvaluatorConfigRow = { ...config, @@ -194,15 +206,15 @@ export const transformEvaluatorConfigsToRows = ( return { key: config.id, id: config.id, - name: config.name, - slug: config.evaluator_key, + name: displayName, + slug: evaluatorKey || config.slug, typeBadge: badge, versionLabel, tags, dateCreated: formatDate(createdAt), lastModified: formatDate(updatedAt), modifiedBy, - avatarName: modifiedBy || config.name, + avatarName: modifiedBy || displayName, raw, } }) diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx 
b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx index ca07709a52..e1494219ab 100644 --- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx +++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx @@ -25,6 +25,7 @@ import { resetPlaygroundAtom, } from "@/oss/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms" import useURL from "@/oss/hooks/useURL" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData" import {Evaluator} from "@/oss/lib/Types" import {evaluatorByKeyAtomFamily} from "@/oss/state/evaluators" @@ -63,7 +64,7 @@ const ConfigureEvaluatorPage = ({evaluatorId}: {evaluatorId?: string | null}) => ) }, [evaluatorConfigs, evaluatorId, stagedConfig]) - const evaluatorKey = existingConfig?.evaluator_key ?? evaluatorId ?? null + const evaluatorKey = resolveEvaluatorKey(existingConfig) ?? evaluatorId ?? 
null const evaluatorQuery = useAtomValue(evaluatorByKeyAtomFamily(evaluatorKey)) const evaluatorFromRegular = evaluators.find((item) => item.key === evaluatorKey) diff --git a/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts b/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts index 3aa171dc76..97fbb7ffc4 100644 --- a/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts +++ b/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts @@ -2,7 +2,7 @@ import {useCallback, useMemo} from "react" import useEvaluators from "@/oss/lib/hooks/useEvaluators" import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import {EvaluatorCategory, EvaluatorPreview, EvaluatorRegistryRow} from "../assets/types" import { @@ -33,7 +33,7 @@ const useEvaluatorsRegistryData = (category: EvaluatorCategory) => { const humanEvaluators = (humanEvaluatorsSwr.data || []) as EvaluatorPreview[] unsortedRows = transformEvaluatorsToRows(humanEvaluators, "human") } else { - const evaluatorConfigs = (evaluatorConfigsSwr.data || []) as EvaluatorConfig[] + const evaluatorConfigs = (evaluatorConfigsSwr.data || []) as SimpleEvaluator[] const baseEvaluators = (baseEvaluatorsSwr.data || []) as Evaluator[] unsortedRows = transformEvaluatorConfigsToRows( diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx index cffdfdcd23..fd64b589ec 100644 --- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx +++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx @@ -7,6 +7,7 @@ import {useRouter} from "next/router" import {message} from "@/oss/components/AppMessageContext" import useURL 
from "@/oss/hooks/useURL" import {useVaultSecret} from "@/oss/hooks/useVaultSecret" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import {redirectIfNoLLMKeys} from "@/oss/lib/helpers/utils" import useAppVariantRevisions from "@/oss/lib/hooks/useAppVariantRevisions" import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData" @@ -289,7 +290,7 @@ const NewEvaluationModalInner = ({ !preview && selectedEvalConfigs.some( (id) => - evaluatorConfigs.find((config) => config.id === id)?.evaluator_key === + resolveEvaluatorKey(evaluatorConfigs.find((config) => config.id === id)) === "auto_ai_critique", ) && (await redirectIfNoLLMKeys({secrets})) diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx index 3545f0b98a..b7bd3b649a 100644 --- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx +++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx @@ -11,9 +11,10 @@ import router from "next/router" import {getMetricsFromEvaluator} from "@/oss/components/SharedDrawers/AnnotateDrawer/assets/transforms" import useURL from "@/oss/hooks/useURL" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import {EvaluatorDto} from "@/oss/lib/hooks/useEvaluators/types" import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import {openEvaluatorDrawerAtom} from "../../../autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms" import type {SelectEvaluatorSectionProps} from "../../types" @@ -88,12 +89,12 @@ const SelectEvaluatorSection = ({ const evaluatorConfigs = useMemo(() => { if 
(preview) { - return evaluators as EvaluatorConfig[] + return [] as SimpleEvaluator[] } return ( propsEvaluatorConfigs?.length ? propsEvaluatorConfigs : evaluatorConfigsSwr.data || [] - ) as EvaluatorConfig[] - }, [preview, propsEvaluatorConfigs, evaluatorConfigsSwr.data, evaluators]) + ) as SimpleEvaluator[] + }, [preview, propsEvaluatorConfigs, evaluatorConfigsSwr.data]) const isLoadingEvaluators = fetchLoadingEvaluators const isLoadingEvaluatorConfigs = fetchLoadingConfigs @@ -122,7 +123,7 @@ const SelectEvaluatorSection = ({ const availableIds = new Set( (preview ? (evaluators as EvaluatorDto<"response">[]) - : (evaluatorConfigs as EvaluatorConfig[]) + : (evaluatorConfigs as SimpleEvaluator[]) ).map((config) => config.id), ) @@ -141,10 +142,9 @@ const SelectEvaluatorSection = ({ // Handler to open the drawer in edit mode const handleEditConfig = useCallback( - (record: EvaluatorConfig) => { - const evaluator = (evaluators as Evaluator[]).find( - (e) => e.key === record.evaluator_key, - ) + (record: SimpleEvaluator) => { + const evaluatorKey = resolveEvaluatorKey(record) + const evaluator = (evaluators as Evaluator[]).find((e) => e.key === evaluatorKey) if (evaluator) { openEvaluatorDrawer({ evaluator, @@ -158,10 +158,9 @@ const SelectEvaluatorSection = ({ // Handler to open the drawer in clone mode const handleCloneConfig = useCallback( - (record: EvaluatorConfig) => { - const evaluator = (evaluators as Evaluator[]).find( - (e) => e.key === record.evaluator_key, - ) + (record: SimpleEvaluator) => { + const evaluatorKey = resolveEvaluatorKey(record) + const evaluator = (evaluators as Evaluator[]).find((e) => e.key === evaluatorKey) if (evaluator) { openEvaluatorDrawer({ evaluator, @@ -203,13 +202,13 @@ const SelectEvaluatorSection = ({ [], ) - const columnsConfig: ColumnsType = useMemo( + const columnsConfig: ColumnsType = useMemo( () => [ { title: "Name", dataIndex: "name", key: "name", - render: (_, record: EvaluatorConfig) => { + render: (_, record: 
SimpleEvaluator) => { return
{record.name}
}, }, @@ -217,10 +216,11 @@ const SelectEvaluatorSection = ({ title: "Type", dataIndex: "type", key: "type", - render: (x, record: EvaluatorConfig) => { + render: (x, record: SimpleEvaluator) => { // Find the evaluator by key to display its name + const evaluatorKey = resolveEvaluatorKey(record) const evaluator = (evaluators as Evaluator[]).find( - (item) => item.key === record.evaluator_key, + (item) => item.key === evaluatorKey, ) return {evaluator?.name} }, @@ -231,7 +231,7 @@ const SelectEvaluatorSection = ({ width: 56, fixed: "right", align: "center", - render: (_, record: EvaluatorConfig) => { + render: (_, record: SimpleEvaluator) => { return ( ({ // Conditionally type filteredEvalConfigs based on Preview const filteredEvalConfigs: Preview extends true ? EvaluatorDto<"response">[] - : EvaluatorConfig[] = useMemo(() => { + : SimpleEvaluator[] = useMemo(() => { if (preview) { // Explicitly narrow types for Preview = true (human evaluations) let data = evaluators as EvaluatorDto<"response">[] @@ -295,21 +295,21 @@ const SelectEvaluatorSection = ({ if (!searchTerm) return data as any return data.filter((item) => - item.name.toLowerCase().includes(searchTerm.toLowerCase()), + (item.name || "").toLowerCase().includes(searchTerm.toLowerCase()), ) as any } else { // Explicitly narrow types for Preview = false - const data = evaluatorConfigs as EvaluatorConfig[] + const data = evaluatorConfigs as SimpleEvaluator[] if (!searchTerm) return data return data.filter((item) => - item.name.toLowerCase().includes(searchTerm.toLowerCase()), + (item.name || "").toLowerCase().includes(searchTerm.toLowerCase()), ) as any } }, [searchTerm, evaluatorConfigs, preview, evaluators]) const onSelectEvalConfig = (selectedRowKeys: React.Key[]) => { const currentSelected = new Set(selectedEvalConfigs) - const configs = filteredEvalConfigs as EvaluatorDto<"response">[] + const configs = filteredEvalConfigs as {id: string}[] configs.forEach((item) => { if (selectedRowKeys.includes(item.id)) 
{ currentSelected.add(item.id) @@ -331,7 +331,7 @@ const SelectEvaluatorSection = ({ ).length > 0 ) } - return (evaluatorConfigs as EvaluatorConfig[]).length > 0 + return (evaluatorConfigs as SimpleEvaluator[]).length > 0 }, [preview, evaluators, evaluatorConfigs]) return ( @@ -418,7 +418,7 @@ const SelectEvaluatorSection = ({ pagination={false} /> ) : ( - + rowSelection={{ type: "checkbox", columnWidth: 48, @@ -442,7 +442,7 @@ const SelectEvaluatorSection = ({ className="ph-no-capture" columns={columnsConfig} rowKey={"id"} - dataSource={filteredEvalConfigs as EvaluatorConfig[]} + dataSource={filteredEvalConfigs as SimpleEvaluator[]} scroll={{x: true, y: 455}} bordered pagination={false} diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts b/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts index a068971bc4..5f838f8665 100644 --- a/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts +++ b/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts @@ -4,7 +4,7 @@ import {ModalProps} from "antd" import {EvaluatorDto} from "@/oss/lib/hooks/useEvaluators/types" import {EnhancedVariant} from "@/oss/lib/shared/variant/transformer/types" -import {LLMRunRateLimit, Evaluator, EvaluatorConfig, testset} from "@/oss/lib/Types" +import {LLMRunRateLimit, Evaluator, SimpleEvaluator, testset} from "@/oss/lib/Types" export interface NewEvaluationAppOption { label: string @@ -54,7 +54,7 @@ export interface NewEvaluationModalContentProps extends HTMLProps[] - evaluatorConfigs: EvaluatorConfig[] + evaluatorConfigs: SimpleEvaluator[] advanceSettings: LLMRunRateLimitWithCorrectAnswer setAdvanceSettings: Dispatch> appOptions: NewEvaluationAppOption[] @@ -95,7 +95,7 @@ export interface SelectTestsetSectionProps extends HTMLProps { } export interface SelectEvaluatorSectionProps extends HTMLProps { - evaluatorConfigs: EvaluatorConfig[] + evaluatorConfigs: SimpleEvaluator[] evaluators: Evaluator[] selectedEvalConfigs: string[] 
setSelectedEvalConfigs: Dispatch> diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx index 6957d3438a..6a0aed5f8f 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx @@ -71,7 +71,7 @@ const AdvancedSettings: React.FC = ({settings, selectedTe return ( { setEvalOutputStatus({success: false, error: false}) setIsLoadingResult(true) - const settingsValues = form.getFieldValue("settings_values") || {} - let normalizedSettings = {...settingsValues} + const parameters = form.getFieldValue("parameters") || {} + let normalizedSettings = {...parameters} if (typeof normalizedSettings.json_schema === "string") { try { @@ -419,7 +419,7 @@ const DebugSection = () => { } if (!selectedEvaluator.key.startsWith("rag_")) { - const correctAnswerKey = settingsValues.correct_answer_key + const correctAnswerKey = parameters.correct_answer_key const groundTruthKey = typeof correctAnswerKey === "string" && correctAnswerKey.startsWith("testcase.") ? 
correctAnswerKey.split(".")[1] diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx index a8128c43e7..c7a3df73f6 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx @@ -105,7 +105,7 @@ export const DynamicFormField: React.FC = ({ form, }) => { const settingsValue = Form.useWatch(name, form) - const runtime = Form.useWatch(["settings_values", "runtime"], form) + const runtime = Form.useWatch(["parameters", "runtime"], form) const classes = useStyles() const {token} = theme.useToken() diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx index a96a07a37f..f5ddf000df 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx @@ -55,7 +55,7 @@ export const FieldsTagsEditor: React.FC = ({ // Watch the correct_answer_key from form to react to changes // Using Form.useWatch instead of form.getFieldValue for reactivity - const formCorrectAnswerKey = Form.useWatch(["settings_values", "correct_answer_key"], form) + const formCorrectAnswerKey = Form.useWatch(["parameters", "correct_answer_key"], form) const effectiveKey = formCorrectAnswerKey || correctAnswerKey // Check if we can detect fields from testcase diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx 
b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 331afe0852..1454b99565 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -13,7 +13,7 @@ import {useAppId} from "@/oss/hooks/useAppId" import useURL from "@/oss/hooks/useURL" import {EvaluationSettingsTemplate, JSSTheme, SettingsPreset} from "@/oss/lib/Types" import { - CreateEvaluationConfigData, + CreateEvaluatorConfigData, createEvaluatorConfig, updateEvaluatorConfig, } from "@/oss/services/evaluations/api" @@ -69,6 +69,13 @@ interface ConfigureEvaluatorProps { onToggleTestPanel?: () => void } +interface ConfigureEvaluatorFormValues { + name: string + description?: string + tags?: string[] + parameters?: Record +} + const useStyles = createUseStyles((theme: JSSTheme) => ({ collapseContainer: { "& .ant-collapse-header": { @@ -199,12 +206,10 @@ const ConfigureEvaluator = ({ const allKeys = Array.from(new Set([...templateKeys, ...presetKeys])) // Clear subtree before applying new values to avoid stale keys - form.setFieldsValue({settings_values: {}}) + form.setFieldsValue({parameters: {}}) if (allKeys.length) { - const fieldNames = allKeys.map( - (key) => ["settings_values", key] as (string | number)[], - ) + const fieldNames = allKeys.map((key) => ["parameters", key] as (string | number)[]) form.resetFields(fieldNames) const nextFields = fieldNames @@ -248,7 +253,7 @@ const ConfigureEvaluator = ({ const evaluatorVersionNumber = useMemo(() => { const raw = - editEvalEditValues?.settings_values?.version ?? + editEvalEditValues?.data?.parameters?.version ?? selectedEvaluator?.settings_template?.version?.default ?? 3 @@ -256,7 +261,7 @@ const ConfigureEvaluator = ({ // extract leading number (e.g., "4", "4.1", "v4") const match = String(raw).match(/\d+(\.\d+)?/) return match ? 
parseFloat(match[0]) : 3 - }, [editEvalEditValues?.settings_values?.version, selectedEvaluator]) + }, [editEvalEditValues?.data?.parameters?.version, selectedEvaluator]) const evalFields = useMemo(() => { const templateEntries = Object.entries(selectedEvaluator?.settings_template || {}) @@ -283,28 +288,25 @@ const ConfigureEvaluator = ({ const advancedSettingsFields = evalFields.filter((field) => field.advanced) const basicSettingsFields = evalFields.filter((field) => !field.advanced) - const onSubmit = async (values: CreateEvaluationConfigData) => { + const onSubmit = async (values: ConfigureEvaluatorFormValues) => { try { setSubmitLoading(true) if (!selectedEvaluator?.key) throw new Error("No selected key") - const settingsValues = values.settings_values || {} + const parameters = values.parameters || {} - const jsonSchemaFieldPath: (string | number)[] = ["settings_values", "json_schema"] - const hasJsonSchema = Object.prototype.hasOwnProperty.call( - settingsValues, - "json_schema", - ) + const jsonSchemaFieldPath: (string | number)[] = ["parameters", "json_schema"] + const hasJsonSchema = Object.prototype.hasOwnProperty.call(parameters, "json_schema") if (hasJsonSchema) { form.setFields([{name: jsonSchemaFieldPath, errors: []}]) - if (typeof settingsValues.json_schema === "string") { + if (typeof parameters.json_schema === "string") { try { - const parsed = JSON.parse(settingsValues.json_schema) + const parsed = JSON.parse(parameters.json_schema) if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { throw new Error() } - settingsValues.json_schema = parsed + parameters.json_schema = parsed } catch { form.setFields([ { @@ -315,9 +317,9 @@ const ConfigureEvaluator = ({ throw new Error("JSON schema must be a valid JSON object") } } else if ( - settingsValues.json_schema && - (typeof settingsValues.json_schema !== "object" || - Array.isArray(settingsValues.json_schema)) + parameters.json_schema && + (typeof parameters.json_schema !== "object" || + 
Array.isArray(parameters.json_schema)) ) { form.setFields([ { @@ -329,40 +331,43 @@ const ConfigureEvaluator = ({ } } - const data = { - ...values, - evaluator_key: selectedEvaluator!.key, - settings_values: settingsValues, + const existingParameters = editEvalEditValues?.data?.parameters || {} + const mergedParameters = {...existingParameters, ...parameters} + + const payload: CreateEvaluatorConfigData = { + name: values.name, + description: values.description, + tags: values.tags, + evaluator_key: selectedEvaluator.key, + parameters, } if (editMode) { - await updateEvaluatorConfig(editEvalEditValues?.id!, data) - - // Update atom with merged values - const updatedConfig = editEvalEditValues - ? { - ...editEvalEditValues, - ...data, - settings_values: settingsValues, - } - : null - if (updatedConfig) { - commitPlayground(updatedConfig) - } + const updatedEvaluator = await updateEvaluatorConfig(editEvalEditValues?.id!, { + id: editEvalEditValues?.id!, + name: values.name, + description: editEvalEditValues?.description, + tags: editEvalEditValues?.tags, + meta: editEvalEditValues?.meta, + flags: editEvalEditValues?.flags, + data: { + ...(editEvalEditValues?.data ?? 
{}), + parameters: mergedParameters, + }, + }) + + commitPlayground(updatedEvaluator) } else { - const response = await createEvaluatorConfig(appId, data) - const createdConfig = response?.data - - if (createdConfig) { - // Use commitPlayground to update state and switch to edit mode - commitPlayground(createdConfig) - if (uiVariant === "page" && createdConfig.id) { - await router.replace( - `${projectURL}/evaluators/configure/${encodeURIComponent( - createdConfig.id, - )}`, - ) - } + const createdConfig = await createEvaluatorConfig(appId, payload) + + // Use commitPlayground to update state and switch to edit mode + commitPlayground(createdConfig) + if (uiVariant === "page" && createdConfig.id) { + await router.replace( + `${projectURL}/evaluators/configure/${encodeURIComponent( + createdConfig.id, + )}`, + ) } } @@ -381,15 +386,15 @@ const ConfigureEvaluator = ({ form.resetFields() if (editMode && editEvalEditValues) { - // Load all values including nested settings_values + // Load all values including nested parameters form.setFieldsValue({ ...editEvalEditValues, - settings_values: editEvalEditValues.settings_values || {}, + parameters: editEvalEditValues.data?.parameters || {}, }) } else if (cloneConfig && editEvalEditValues) { - // When cloning, copy only settings_values and clear the name so user provides a new name + // When cloning, copy only parameters and clear the name so user provides a new name form.setFieldsValue({ - settings_values: editEvalEditValues.settings_values || {}, + parameters: editEvalEditValues.data?.parameters || {}, name: "", }) } else if (selectedEvaluator?.settings_template) { @@ -404,7 +409,7 @@ const ConfigureEvaluator = ({ } if (Object.keys(defaultSettings).length > 0) { form.setFieldsValue({ - settings_values: defaultSettings, + parameters: defaultSettings, }) } } @@ -556,7 +561,7 @@ const ConfigureEvaluator = ({ key={field.key} traceTree={traceTree} form={form} - name={["settings_values", field.key]} + name={["parameters", 
field.key]} /> ))} @@ -674,7 +679,7 @@ const ConfigureEvaluator = ({ key={field.key} traceTree={traceTree} form={form} - name={["settings_values", field.key]} + name={["parameters", field.key]} /> ))} diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts index 76b8c134c2..dcb15dcd42 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts @@ -18,7 +18,7 @@ import type {FormInstance} from "antd" import {atom} from "jotai" import {atomWithReset, atomWithStorage, RESET} from "jotai/utils" -import type {Evaluator, EvaluatorConfig, Variant} from "@/oss/lib/Types" +import type {Evaluator, SimpleEvaluator, Variant} from "@/oss/lib/Types" import {stringStorage} from "@/oss/state/utils/stringStorage" // ================================================================ @@ -84,7 +84,7 @@ export const playgroundIsCloneModeAtom = atom((get) => get(playgroundSessionAtom * - In edit mode: loaded from existing config * - In clone mode: copied from source config (with cleared name) */ -export const playgroundEditValuesAtom = atomWithReset(null) +export const playgroundEditValuesAtom = atomWithReset(null) // ================================================================ // FORM STATE @@ -95,7 +95,7 @@ export const playgroundEditValuesAtom = atomWithReset(nu * Allows DebugSection to read form values for running the evaluator * * This is set by ConfigureEvaluator when the form mounts - * and read by DebugSection to get current settings_values + * and read by DebugSection to get current parameters */ export const playgroundFormRefAtom = atom(null) @@ -179,7 +179,7 @@ export const initPlaygroundAtom = atom( set, payload: { evaluator: Evaluator - 
existingConfig?: EvaluatorConfig | null + existingConfig?: SimpleEvaluator | null mode?: PlaygroundMode }, ) => { @@ -226,7 +226,7 @@ export const resetPlaygroundAtom = atom(null, (get, set) => { * * @param savedConfig - The config returned from the API */ -export const commitPlaygroundAtom = atom(null, (get, set, savedConfig: EvaluatorConfig) => { +export const commitPlaygroundAtom = atom(null, (get, set, savedConfig: SimpleEvaluator) => { // Update edit values with saved config set(playgroundEditValuesAtom, savedConfig) @@ -280,7 +280,7 @@ export const openEvaluatorDrawerAtom = atom( set, payload: { evaluator: Evaluator - existingConfig?: EvaluatorConfig | null + existingConfig?: SimpleEvaluator | null mode?: PlaygroundMode }, ) => { diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx index 0ac235b386..c30bb3c1f1 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx @@ -5,11 +5,11 @@ import {Modal, Space, theme, Typography} from "antd" import {createUseStyles} from "react-jss" import {checkIfResourceValidForDeletion} from "@/oss/lib/evaluations/legacy" -import {EvaluatorConfig, JSSTheme} from "@/oss/lib/Types" +import {JSSTheme, SimpleEvaluator} from "@/oss/lib/Types" import {deleteEvaluatorConfig} from "@/oss/services/evaluations/api" type DeleteModalProps = { - selectedEvalConfig: EvaluatorConfig + selectedEvalConfig: SimpleEvaluator onSuccess: () => void } & React.ComponentProps diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx index f3c9434a38..72aaf034fc 100644 --- 
a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx @@ -7,18 +7,19 @@ import {useAtom} from "jotai" import {createUseStyles} from "react-jss" import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import {formatDay} from "@/oss/lib/helpers/dateTimeHelper" -import {Evaluator, EvaluatorConfig, JSSTheme} from "@/oss/lib/Types" +import {Evaluator, JSSTheme, SimpleEvaluator} from "@/oss/lib/Types" import DeleteModal from "./DeleteModal" interface EvaluatorCardProps { - evaluatorConfigs: EvaluatorConfig[] + evaluatorConfigs: SimpleEvaluator[] setEditMode: React.Dispatch> setCloneConfig: React.Dispatch> setCurrent: React.Dispatch> setSelectedEvaluator: React.Dispatch> - setEditEvalEditValues: React.Dispatch> + setEditEvalEditValues: React.Dispatch> onSuccess: () => void } @@ -88,22 +89,21 @@ const EvaluatorCard = ({ const classes = useStyles() const evaluators = useAtom(evaluatorsAtom)[0] const [openDeleteModal, setOpenDeleteModal] = useState(false) - const [selectedDelEval, setSelectedDelEval] = useState(null) + const [selectedDelEval, setSelectedDelEval] = useState(null) return (
{evaluatorConfigs.length ? ( evaluatorConfigs.map((item) => { - const evaluator = evaluators.find((e) => e.key === item.evaluator_key) + const evaluatorKey = resolveEvaluatorKey(item) + const evaluator = evaluators.find((e) => e.key === evaluatorKey) return ( { - const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, - ) + const selectedEval = evaluators.find((e) => e.key === evaluatorKey) if (selectedEval) { setEditMode(true) setSelectedEvaluator(selectedEval) @@ -130,7 +130,7 @@ const EvaluatorCard = ({ onClick: (e: any) => { e.domEvent.stopPropagation() const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, + (e) => e.key === evaluatorKey, ) if (selectedEval) { setEditMode(true) @@ -147,7 +147,7 @@ const EvaluatorCard = ({ onClick: (e: any) => { e.domEvent.stopPropagation() const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, + (e) => e.key === evaluatorKey, ) if (selectedEval) { setCloneConfig(true) diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx index 2e38bfd1c2..33c03a9f89 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx @@ -7,17 +7,18 @@ import {ColumnsType} from "antd/es/table" import {useAtom} from "jotai" import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import DeleteModal from "./DeleteModal" interface EvaluatorListProps { - evaluatorConfigs: EvaluatorConfig[] + evaluatorConfigs: SimpleEvaluator[] setEditMode: React.Dispatch> setCloneConfig: React.Dispatch> 
setCurrent: React.Dispatch> setSelectedEvaluator: React.Dispatch> - setEditEvalEditValues: React.Dispatch> + setEditEvalEditValues: React.Dispatch> onSuccess: () => void } @@ -32,9 +33,9 @@ const EvaluatorList = ({ }: EvaluatorListProps) => { const evaluators = useAtom(evaluatorsAtom)[0] const [openDeleteModal, setOpenDeleteModal] = useState(false) - const [selectedDelEval, setSelectedDelEval] = useState(null) + const [selectedDelEval, setSelectedDelEval] = useState(null) - const columns: ColumnsType = [ + const columns: ColumnsType = [ // { // title: "Version", // dataIndex: "version", @@ -56,7 +57,8 @@ const EvaluatorList = ({ dataIndex: "type", key: "type", render: (_, record) => { - const evaluator = evaluators.find((item) => item.key === record.evaluator_key) + const evaluatorKey = resolveEvaluatorKey(record) + const evaluator = evaluators.find((item) => item.key === evaluatorKey) return {evaluator?.name} }, }, @@ -84,8 +86,9 @@ const EvaluatorList = ({ icon: , onClick: (e: any) => { e.domEvent.stopPropagation() + const evaluatorKey = resolveEvaluatorKey(record) const selectedEval = evaluators.find( - (e) => e.key === record.evaluator_key, + (e) => e.key === evaluatorKey, ) if (selectedEval) { setEditMode(true) @@ -101,8 +104,9 @@ const EvaluatorList = ({ icon: , onClick: (e: any) => { e.domEvent.stopPropagation() + const evaluatorKey = resolveEvaluatorKey(record) const selectedEval = evaluators.find( - (e) => e.key === record.evaluator_key, + (e) => e.key === evaluatorKey, ) if (selectedEval) { setCloneConfig(true) @@ -151,7 +155,8 @@ const EvaluatorList = ({ onRow={(record) => ({ style: {cursor: "pointer"}, onClick: () => { - const selectedEval = evaluators.find((e) => e.key === record.evaluator_key) + const evaluatorKey = resolveEvaluatorKey(record) + const selectedEval = evaluators.find((e) => e.key === evaluatorKey) if (selectedEval) { setEditMode(true) setSelectedEvaluator(selectedEval) diff --git 
a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx index 60569766c2..564bc38df9 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx @@ -8,21 +8,22 @@ import {createUseStyles} from "react-jss" import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation" import {getEvaluatorTags} from "@/oss/lib/evaluations/legacy" -import {Evaluator, EvaluatorConfig, JSSTheme} from "@/oss/lib/Types" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" +import {Evaluator, JSSTheme, SimpleEvaluator} from "@/oss/lib/Types" import {nonArchivedEvaluatorsAtom} from "@/oss/state/evaluators" import EvaluatorCard from "./EvaluatorCard" import EvaluatorList from "./EvaluatorList" interface EvaluatorsProps { - evaluatorConfigs: EvaluatorConfig[] + evaluatorConfigs: SimpleEvaluator[] handleOnCancel: () => void setCurrent: React.Dispatch> setSelectedEvaluator: React.Dispatch> fetchingEvalConfigs: boolean setEditMode: React.Dispatch> setCloneConfig: React.Dispatch> - setEditEvalEditValues: React.Dispatch> + setEditEvalEditValues: React.Dispatch> onSuccess: () => void setEvaluatorsDisplay: any evaluatorsDisplay: string @@ -95,10 +96,13 @@ const Evaluators = ({ const updatedEvaluatorConfigs = useMemo(() => { return evaluatorConfigs.map((config) => { - const matchingEvaluator = evaluators.find( - (evaluator) => evaluator.key === config.evaluator_key, + const evaluatorKey = resolveEvaluatorKey(config) + const matchingEvaluator = evaluators.find((evaluator) => evaluator.key === evaluatorKey) + if (!matchingEvaluator) return config + const nextTags = Array.from( + new Set([...(config.tags || []), ...(matchingEvaluator.tags || [])]), ) - return matchingEvaluator ? 
{...config, tags: matchingEvaluator.tags} : config + return {...config, tags: nextTags} }) }, [evaluatorConfigs, evaluators]) @@ -111,7 +115,7 @@ const Evaluators = ({ if (searchTerm) { filtered = filtered.filter((item) => - item.name.toLowerCase().includes(searchTerm.toLowerCase()), + (item.name || "").toLowerCase().includes(searchTerm.toLowerCase()), ) } diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index b89da2ee19..c06202394c 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -9,7 +9,7 @@ import EnhancedModal from "@/oss/components/EnhancedUIs/Modal" import {useAppId} from "@/oss/hooks/useAppId" import {evaluatorConfigsAtom} from "@/oss/lib/atoms/evaluation" import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import ConfigureEvaluator from "./ConfigureEvaluator" import {initPlaygroundAtom, resetPlaygroundAtom} from "./ConfigureEvaluator/state/atoms" @@ -39,7 +39,7 @@ const EvaluatorsModal = ({ useFetchEvaluatorsData({appId: appId ?? 
""}) const [editMode, setEditMode] = useState(false) const [cloneConfig, setCloneConfig] = useState(false) - const [editEvalEditValues, setEditEvalEditValues] = useState(null) + const [editEvalEditValues, setEditEvalEditValues] = useState(null) const [evaluatorsDisplay, setEvaluatorsDisplay] = useLocalStorage<"card" | "list">( "evaluator_view", "list", diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx b/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx index 9687626c3c..d0281e7539 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx @@ -63,7 +63,7 @@ const OnlineEvaluationDrawer = ({open, onClose, onCreate}: OnlineEvaluationDrawe const filterColumns = useMemo(() => getFilterColumns(), []) const [filters, setFilters] = useAtom(onlineEvalFiltersAtom) const resetFilters = useSetAtom(resetOnlineEvalFiltersAtom) - // Load preview evaluators (with IDs) to map evaluator_config.evaluator_key -> evaluator.id + // Load preview evaluators (with IDs) to map config URI key -> evaluator.id const previewEvaluatorsSwr = useEvaluators({preview: true, queries: {is_human: false}}) const baseEvaluators = (baseEvaluatorsSwr.data as Evaluator[] | undefined) ?? [] const evaluators = useAtomValue(evaluatorConfigsAtom) diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts index a49787e814..0545163cae 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts @@ -47,18 +47,41 @@ const mergeEvaluatorWithConfig = ( ...configAny, } - const previewSettings = isPlainObject(evaluatorAny.settings_values) - ? 
(evaluatorAny.settings_values as Record) + const previewData = isPlainObject(evaluatorAny.data) + ? (evaluatorAny.data as Record) : undefined - const configSettings = isPlainObject(configAny.settings_values) - ? (configAny.settings_values as Record) + const configData = isPlainObject(configAny.data) + ? (configAny.data as Record) : undefined + if (previewData || configData) { + const mergedData: Record = { + ...(previewData ?? {}), + ...(configData ?? {}), + } - if (previewSettings || configSettings) { - merged.settings_values = { + const previewParameters = isPlainObject(previewData?.parameters) + ? (previewData?.parameters as Record) + : undefined + const configParameters = isPlainObject(configData?.parameters) + ? (configData?.parameters as Record) + : undefined + const previewSettings = isPlainObject(evaluatorAny.settings_values) + ? (evaluatorAny.settings_values as Record) + : undefined + const configSettings = isPlainObject(configAny.settings_values) + ? (configAny.settings_values as Record) + : undefined + const mergedParameters = { + ...(previewParameters ?? {}), ...(previewSettings ?? {}), + ...(configParameters ?? {}), ...(configSettings ?? 
{}), } + if (Object.keys(mergedParameters).length) { + mergedData.parameters = mergedParameters + } + + merged.data = mergedData } return merged as EvaluatorPreviewDto diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx index af624b9f83..d5e724dcef 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx @@ -2,8 +2,9 @@ import {useMemo} from "react" import {SelectProps} from "antd" +import {getEvaluatorParameters, resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import type {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types" -import type {Evaluator} from "@/oss/lib/Types" +import type {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import { ALLOWED_ONLINE_EVALUATOR_KEYS, @@ -13,7 +14,7 @@ import { import {capitalize, collectEvaluatorCandidates} from "../utils/evaluatorDetails" interface UseEvaluatorSelectionParams { - evaluators: any[] + evaluators: SimpleEvaluator[] selectedEvaluatorId: string | undefined previewEvaluators: EvaluatorPreviewDto[] baseEvaluators: Evaluator[] @@ -21,16 +22,17 @@ interface UseEvaluatorSelectionParams { interface EvaluatorSelectionResult { evaluatorOptions: SelectProps["options"] - selectedEvaluatorConfig?: any + selectedEvaluatorConfig?: SimpleEvaluator matchedPreviewEvaluator?: EvaluatorPreviewDto evaluatorTypeLookup: Map } -const buildEvaluatorOptions = (configs: any[]): SelectProps["options"] => +const buildEvaluatorOptions = (configs: SimpleEvaluator[]): SelectProps["options"] => (configs || []).map((cfg: any) => { const iconSrc = (cfg?.icon_url && (cfg.icon_url.src || cfg.icon_url)) || undefined const displayName = cfg?.name || "" - const searchable = [displayName, cfg?.evaluator_key, cfg?.id] + const evaluatorKey = 
resolveEvaluatorKey(cfg) + const searchable = [displayName, evaluatorKey, cfg?.id, cfg?.slug, cfg?.data?.uri] .map((item) => { if (item === undefined || item === null) return undefined const text = String(item).trim() @@ -61,6 +63,7 @@ const buildPreviewLookup = (previewEvaluators: EvaluatorPreviewDto[]) => { const map = new Map() previewEvaluators.forEach((evaluator) => { const rawKey = + resolveEvaluatorKey(evaluator as any) || (evaluator as any)?.evaluator_key || (evaluator as any)?.flags?.evaluator_key || (evaluator as any)?.meta?.evaluator_key || @@ -122,13 +125,14 @@ export const useEvaluatorSelection = ({ const allowedEvaluators = useMemo(() => { if (!evaluators?.length) return [] - return evaluators.filter((config: any) => { + return evaluators.filter((config: SimpleEvaluator) => { if (!config) return false + const evaluatorKey = resolveEvaluatorKey(config) const candidates = collectEvaluatorCandidates( - config?.evaluator_key, - (config as any)?.slug, + evaluatorKey, + config?.slug, config?.name, - config?.key, + (config as any)?.key, config?.meta?.evaluator_key, config?.meta?.key, ) @@ -141,13 +145,13 @@ export const useEvaluatorSelection = ({ if (!allowedEvaluators.length) return [] if (!ENABLE_CORRECT_ANSWER_KEY_FILTER) return allowedEvaluators const requiringKey = evaluatorsRequiringCorrectAnswerKey ?? 
new Set() - return allowedEvaluators.filter((config: any) => { + return allowedEvaluators.filter((config: SimpleEvaluator) => { if (!config) return false - const evaluatorKey = config?.evaluator_key + const evaluatorKey = resolveEvaluatorKey(config) if (evaluatorKey && requiringKey.has(evaluatorKey)) { return false } - const settingsValues = config?.settings_values || {} + const settingsValues = getEvaluatorParameters(config) const requiresCorrectAnswerKey = Object.entries(settingsValues).some(([key, value]) => { if (!key) return false const normalizedKey = key.toLowerCase() @@ -176,7 +180,7 @@ export const useEvaluatorSelection = ({ const previewLookup = useMemo(() => buildPreviewLookup(previewEvaluators), [previewEvaluators]) const matchedPreviewEvaluator = useMemo(() => { - const key = (selectedEvaluatorConfig as any)?.evaluator_key as string | undefined + const key = resolveEvaluatorKey(selectedEvaluatorConfig) if (!key) return undefined return previewLookup.get(key.toLowerCase()) }, [selectedEvaluatorConfig, previewLookup]) diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts index fb54e0978b..3cef385d5d 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts @@ -3,6 +3,7 @@ import {useMemo} from "react" import {useAtomValue} from "jotai" import {evaluatorConfigsAtom} from "@/oss/lib/atoms/evaluation" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import useEvaluatorConfigs from "@/oss/lib/hooks/useEvaluatorConfigs" import {EVALUATOR_CATEGORY_LABEL_MAP} from "../constants" @@ -25,6 +26,7 @@ export const useEvaluatorTypeFromConfigs = ({ } const candidates = collectEvaluatorCandidates( + resolveEvaluatorKey(evaluator as any), (evaluator as 
any)?.slug, (evaluator as any)?.key, (evaluator as any)?.meta?.evaluator_key, @@ -32,7 +34,7 @@ export const useEvaluatorTypeFromConfigs = ({ ) const match = configs.find((cfg) => { - const key = (cfg?.evaluator_key || cfg?.name || cfg?.id || "").toString().trim() + const key = (resolveEvaluatorKey(cfg) || cfg?.name || cfg?.id || "").toString().trim() if (!key) return false const lower = key.toLowerCase() if (candidates.includes(lower)) return true @@ -63,7 +65,7 @@ export const useEvaluatorTypeFromConfigs = ({ // 2) Infer label by scanning evaluator_key/name tokens for known category slugs const categorySlugs = Object.keys(EVALUATOR_CATEGORY_LABEL_MAP || {}) const keyTokens = [ - (match as any)?.evaluator_key, + resolveEvaluatorKey(match), (match as any)?.name, (evaluator as any)?.key, (evaluator as any)?.name, diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts index fddb7511af..fdbd26e16f 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts @@ -1,3 +1,4 @@ +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import type {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types" import { @@ -126,6 +127,7 @@ export const extractEvaluatorType = ( } const candidates = collectEvaluatorCandidates( + resolveEvaluatorKey(evaluator as any), (evaluator as any)?.slug, (evaluator as any)?.key, (evaluator as any)?.name, @@ -290,8 +292,8 @@ export const extractParameterList = (evaluator?: EvaluatorPreviewDto): Parameter // Support both simple preview artifacts and workflow evaluators const parameterSources = [ - (evaluator as any)?.settings_values, (evaluator as any)?.data?.parameters, + (evaluator as any)?.settings_values, (evaluator as any)?.data?.service?.configuration?.parameters, (evaluator as 
any)?.data?.configuration?.parameters, ] @@ -359,8 +361,8 @@ export const extractModelName = (evaluator?: EvaluatorPreviewDto) => { } const sources = [ - (evaluator as any)?.settings_values, (evaluator as any)?.data?.parameters, + (evaluator as any)?.settings_values, (evaluator as any)?.data?.service?.configuration, (evaluator as any)?.data?.service?.configuration?.parameters, (evaluator as any)?.data?.configuration, @@ -660,7 +662,8 @@ const normalizeMessageContent = ( export const extractPromptSections = (evaluator?: EvaluatorPreviewDto): PromptPreviewSection[] => { if (!evaluator) return [] const data = (evaluator as any)?.data ?? {} - const settings = (evaluator as any)?.settings_values + const parameters = data?.parameters + const settings = parameters ?? (evaluator as any)?.settings_values const agConfig = data?.parameters?.ag_config ?? data?.parameters?.agConfig const messages = findFirstMessages(settings) ?? @@ -728,7 +731,6 @@ export const extractPromptSections = (evaluator?: EvaluatorPreviewDto): PromptPr const promptSources = [ settings, - data?.parameters, data?.service?.configuration?.parameters, data?.configuration?.parameters, ] diff --git a/web/oss/src/lib/Types.ts b/web/oss/src/lib/Types.ts index 74f6f31a51..2bd357fc60 100644 --- a/web/oss/src/lib/Types.ts +++ b/web/oss/src/lib/Types.ts @@ -870,6 +870,76 @@ export interface Evaluator { archived?: boolean } +export interface SimpleEvaluatorData { + version?: string + uri?: string + url?: string + headers?: Record + schemas?: Record + script?: {content?: string; runtime?: string} + parameters?: Record + service?: Record + configuration?: Record +} + +export interface SimpleEvaluatorFlags { + is_custom?: boolean + is_evaluator?: boolean + is_human?: boolean + requires_llm_api_keys?: boolean + evaluator_key?: string + color?: string +} + +export interface SimpleEvaluator { + id: string + slug: string + name?: string + description?: string + tags?: string[] + meta?: Record + flags?: SimpleEvaluatorFlags 
+ data?: SimpleEvaluatorData + created_at?: string + updated_at?: string + deleted_at?: string | null + created_by_id?: string + updated_by_id?: string + deleted_by_id?: string + color?: string + icon_url?: string | StaticImageData +} + +export interface SimpleEvaluatorCreate { + slug: string + name?: string + description?: string + tags?: string[] + meta?: Record + flags?: SimpleEvaluatorFlags + data?: SimpleEvaluatorData +} + +export interface SimpleEvaluatorEdit { + id: string + name?: string + description?: string + tags?: string[] + meta?: Record + flags?: SimpleEvaluatorFlags + data?: SimpleEvaluatorData +} + +export interface SimpleEvaluatorResponse { + count: number + evaluator: SimpleEvaluator | null +} + +export interface SimpleEvaluatorsResponse { + count: number + evaluators: SimpleEvaluator[] +} + export interface EvaluatorConfig { id: string evaluator_key: string diff --git a/web/oss/src/lib/atoms/evaluation.ts b/web/oss/src/lib/atoms/evaluation.ts index 323dde41cb..1fbc0039ad 100644 --- a/web/oss/src/lib/atoms/evaluation.ts +++ b/web/oss/src/lib/atoms/evaluation.ts @@ -1,6 +1,6 @@ import {atom} from "jotai" -import {Evaluation, EvaluationScenario, Evaluator, EvaluatorConfig} from "../Types" +import {Evaluation, EvaluationScenario, Evaluator, SimpleEvaluator} from "../Types" export const evaluationAtom = atom(undefined) @@ -8,4 +8,4 @@ export const evaluationScenariosAtom = atom([]) export const evaluatorsAtom = atom([]) -export const evaluatorConfigsAtom = atom([]) +export const evaluatorConfigsAtom = atom([]) diff --git a/web/oss/src/lib/evaluators/utils.ts b/web/oss/src/lib/evaluators/utils.ts new file mode 100644 index 0000000000..e21d98a62e --- /dev/null +++ b/web/oss/src/lib/evaluators/utils.ts @@ -0,0 +1,80 @@ +import type {SimpleEvaluator, SimpleEvaluatorData} from "@/oss/lib/Types" + +const normalizeSlugBase = (value?: string | null) => + String(value ?? 
"") + .trim() + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, "") + +const trimVersionSuffix = (value: string) => value.replace(/-v\d+$/i, "") + +export const extractEvaluatorKeyFromUri = (uri?: string | null): string | undefined => { + if (!uri) return undefined + const trimmed = uri.trim() + if (!trimmed) return undefined + + const builtinMatch = trimmed.match(/^agenta:builtin:([^:]+)(:|$)/i) + if (builtinMatch?.[1]) { + return trimVersionSuffix(builtinMatch[1]) + } + + const parts = trimmed.split(":").filter(Boolean) + if (parts.length >= 3 && parts[2]) { + return trimVersionSuffix(parts[2]) + } + + const slashParts = trimmed.split("/").filter(Boolean) + const lastSegment = slashParts[slashParts.length - 1] + if (lastSegment) { + return trimVersionSuffix(lastSegment) + } + + return undefined +} + +export const resolveEvaluatorKey = ( + evaluator?: Partial | null, +): string | undefined => { + if (!evaluator) return undefined + + const candidate = + extractEvaluatorKeyFromUri(evaluator.data?.uri) || + (typeof (evaluator as any)?.evaluator_key === "string" + ? (evaluator as any).evaluator_key + : undefined) || + (typeof evaluator.meta?.evaluator_key === "string" + ? evaluator.meta.evaluator_key + : undefined) || + (typeof evaluator.flags?.evaluator_key === "string" + ? evaluator.flags.evaluator_key + : undefined) || + (typeof (evaluator as any)?.key === "string" ? (evaluator as any).key : undefined) + + return candidate ? 
String(candidate).trim() : undefined +} + +export const buildEvaluatorUri = (evaluatorKey: string, version = "v0") => + `agenta:builtin:${evaluatorKey}:${version}` + +export const buildEvaluatorSlug = (name?: string | null) => { + const base = normalizeSlugBase(name) || "evaluator" + const suffix = Math.random().toString(36).slice(2, 8) + const maxBaseLength = Math.max(1, 50 - suffix.length - 1) + const trimmedBase = base.slice(0, maxBaseLength) + return `${trimmedBase}-${suffix}` +} + +export const mergeEvaluatorData = ( + base?: SimpleEvaluatorData | null, + updates?: Partial | null, +): SimpleEvaluatorData | undefined => { + if (!base && !updates) return undefined + return { + ...(base ?? {}), + ...(updates ?? {}), + } +} + +export const getEvaluatorParameters = (evaluator?: Partial | null) => + (evaluator?.data?.parameters as Record) || {} diff --git a/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts b/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts index 3765eb6677..998f65459e 100644 --- a/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts +++ b/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts @@ -6,11 +6,11 @@ import {SWRConfiguration} from "swr" import {useAppId} from "@/oss/hooks/useAppId" import {evaluatorConfigsQueryAtomFamily} from "@/oss/state/evaluators" -import {EvaluatorConfig} from "../../Types" +import {SimpleEvaluator} from "../../Types" type EvaluatorConfigResult = Preview extends true ? 
undefined - : EvaluatorConfig[] + : SimpleEvaluator[] type EvaluatorConfigsOptions = { preview?: Preview diff --git a/web/oss/src/services/evaluations/api/index.ts b/web/oss/src/services/evaluations/api/index.ts index 43bfdb3ca8..9702d501b5 100644 --- a/web/oss/src/services/evaluations/api/index.ts +++ b/web/oss/src/services/evaluations/api/index.ts @@ -17,7 +17,7 @@ export { createEvaluatorConfig, updateEvaluatorConfig, deleteEvaluatorConfig, - type CreateEvaluationConfigData, + type CreateEvaluatorConfigData, } from "@/oss/services/evaluators" //Prefix convention: diff --git a/web/oss/src/services/evaluators/index.ts b/web/oss/src/services/evaluators/index.ts index 2a9bb15de7..9a85ae7c8c 100644 --- a/web/oss/src/services/evaluators/index.ts +++ b/web/oss/src/services/evaluators/index.ts @@ -1,9 +1,21 @@ import axios from "@/oss/lib/api/assets/axiosConfig" +import { + buildEvaluatorSlug, + buildEvaluatorUri, + resolveEvaluatorKey, +} from "@/oss/lib/evaluators/utils" import {getAgentaApiUrl} from "@/oss/lib/helpers/api" import {getTagColors} from "@/oss/lib/helpers/colors" import {isDemo, stringToNumberInRange} from "@/oss/lib/helpers/utils" import {EvaluatorResponseDto} from "@/oss/lib/hooks/useEvaluators/types" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import { + Evaluator, + SimpleEvaluator, + SimpleEvaluatorCreate, + SimpleEvaluatorEdit, + SimpleEvaluatorResponse, + SimpleEvaluatorsResponse, +} from "@/oss/lib/Types" import aiImg from "@/oss/media/artificial-intelligence.png" import bracketCurlyImg from "@/oss/media/bracket-curly.png" import codeImg from "@/oss/media/browser.png" @@ -48,7 +60,7 @@ export const updateEvaluator = async ( } } -export const fetchEvaluatorById = async (evaluatorId: string) => { +export const fetchEvaluatorById = async (evaluatorId: string): Promise => { const {projectId} = getProjectValues() if (!projectId) { return null @@ -59,7 +71,7 @@ export const fetchEvaluatorById = async (evaluatorId: string) => { ) const 
payload = (response?.data as any)?.evaluator ?? response?.data ?? null if (!payload) return null - return payload as EvaluatorResponseDto<"response">["evaluator"] + return decorateSimpleEvaluator(payload as SimpleEvaluator) } const evaluatorIconsMap = { @@ -103,58 +115,112 @@ export const fetchAllEvaluators = async (includeArchived = false) => { } // Evaluator Configs +function decorateSimpleEvaluator(evaluator: SimpleEvaluator) { + const tagColors = getTagColors() + const evaluatorKey = resolveEvaluatorKey(evaluator) + if (!evaluatorKey) return evaluator + + return { + ...evaluator, + icon_url: evaluatorIconsMap[evaluatorKey as keyof typeof evaluatorIconsMap], + color: tagColors[stringToNumberInRange(evaluatorKey, 0, tagColors.length - 1)], + } +} + export const fetchAllEvaluatorConfigs = async ( appId?: string | null, projectIdOverride?: string | null, -) => { - const tagColors = getTagColors() +): Promise => { const {projectId: projectIdFromStore} = getProjectValues() const projectId = projectIdOverride ?? projectIdFromStore + void appId if (!projectId) { - return [] as EvaluatorConfig[] + return [] as SimpleEvaluator[] } - const response = await axios.get("/evaluators/configs", { - params: { - project_id: projectId, - ...(appId ? {app_id: appId} : {}), + const response = await axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/query?project_id=${projectId}`, + { + evaluator: { + flags: { + is_evaluator: true, + is_human: false, + }, + }, + include_archived: false, }, - }) - const evaluatorConfigs = (response.data || []).map((item: EvaluatorConfig) => ({ - ...item, - icon_url: evaluatorIconsMap[item.evaluator_key as keyof typeof evaluatorIconsMap], - color: tagColors[stringToNumberInRange(item.evaluator_key, 0, tagColors.length - 1)], - })) as EvaluatorConfig[] - return evaluatorConfigs + ) + + const evaluators = response.data?.evaluators ?? 
[] + return evaluators.filter((item) => !item.deleted_at).map(decorateSimpleEvaluator) +} + +export interface CreateEvaluatorConfigData { + name: string + evaluator_key: string + parameters: Record + tags?: string[] + description?: string } -export type CreateEvaluationConfigData = Omit export const createEvaluatorConfig = async ( _appId: string | null | undefined, - config: CreateEvaluationConfigData, -) => { + config: CreateEvaluatorConfigData, +): Promise => { const {projectId} = getProjectValues() void _appId - return axios.post(`/evaluators/configs?project_id=${projectId}`, { - ...config, - }) + const payload: SimpleEvaluatorCreate = { + slug: buildEvaluatorSlug(config.name), + name: config.name, + description: config.description, + tags: config.tags, + flags: {is_evaluator: true, is_human: false}, + data: { + uri: buildEvaluatorUri(config.evaluator_key), + parameters: config.parameters, + }, + } + + const response = await axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/?project_id=${projectId}`, + {evaluator: payload}, + ) + + const evaluator = response.data?.evaluator + if (!evaluator) { + throw new Error("Failed to create evaluator") + } + + return decorateSimpleEvaluator(evaluator) } export const updateEvaluatorConfig = async ( configId: string, - config: Partial, -) => { + config: SimpleEvaluatorEdit, +): Promise => { const {projectId} = getProjectValues() - return axios.put(`/evaluators/configs/${configId}?project_id=${projectId}`, config) + const response = await axios.put( + `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}?project_id=${projectId}`, + {evaluator: {...config, id: configId}}, + ) + + const evaluator = response.data?.evaluator + if (!evaluator) { + throw new Error("Failed to update evaluator") + } + + return decorateSimpleEvaluator(evaluator) } export const deleteEvaluatorConfig = async (configId: string) => { const {projectId} = getProjectValues() - return 
axios.delete(`/evaluators/configs/${configId}?project_id=${projectId}`) + return axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}/archive?project_id=${projectId}`, + ) } export const deleteHumanEvaluator = async (evaluatorId: string) => { diff --git a/web/oss/src/state/evaluators/atoms.ts b/web/oss/src/state/evaluators/atoms.ts index 24f390e884..5c6b34c9e2 100644 --- a/web/oss/src/state/evaluators/atoms.ts +++ b/web/oss/src/state/evaluators/atoms.ts @@ -5,6 +5,7 @@ import {atomWithQuery} from "jotai-tanstack-query" import {getMetricsFromEvaluator} from "@/oss/components/SharedDrawers/AnnotateDrawer/assets/transforms" import axios from "@/oss/lib/api/assets/axiosConfig" import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation" +import {extractEvaluatorKeyFromUri} from "@/oss/lib/evaluators/utils" import {transformApiData} from "@/oss/lib/hooks/useAnnotations/assets/transformer" import { EvaluatorDto, @@ -13,7 +14,7 @@ import { EvaluatorRevisionsResponseDto, EvaluatorsResponseDto, } from "@/oss/lib/hooks/useEvaluators/types" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/oss/services/evaluators" import {selectedAppIdAtom} from "@/oss/state/app" import {selectedOrgAtom} from "@/oss/state/org" @@ -26,16 +27,15 @@ import {EvaluatorConfigsParams, EvaluatorsParams} from "./types" const extractKeyFromUri = (uri: unknown): string | undefined => { if (typeof uri !== "string") return undefined - const match = uri.match(/[:/](auto_[a-z0-9_]+)/i) - if (match?.[1]) return match[1] - const parts = uri.split(":").filter(Boolean) - if (parts.length) { - const candidate = parts[parts.length - 1] - if (candidate) { - return candidate.replace(/-v\d+$/i, "") - } - } - return undefined + return ( + extractEvaluatorKeyFromUri(uri) || + uri.match(/[:/](auto_[a-z0-9_]+)/i)?.[1] || + uri + .split(":") + .filter(Boolean) + 
.slice(-1)[0] + ?.replace(/-v\d+$/i, "") + ) } const isPlainObject = (value: unknown): value is Record => { @@ -102,7 +102,7 @@ const extractRequiresLlmApiKeys = (source: unknown): boolean | undefined => { export const evaluatorConfigsQueryAtomFamily = atomFamily( ({projectId: overrideProjectId, appId: overrideAppId, preview}: EvaluatorConfigsParams = {}) => - atomWithQuery((get) => { + atomWithQuery((get) => { const projectId = overrideProjectId || get(projectIdAtom) const appId = overrideAppId || get(selectedAppIdAtom) const user = get(userAtom) as {id?: string} | null