diff --git a/backend/app/alembic/versions/041_add_config_in_evals_run_table.py b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py
new file mode 100644
index 000000000..449768b38
--- /dev/null
+++ b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py
@@ -0,0 +1,60 @@
+"""add config in evals run table
+
+Revision ID: 041
+Revises: 040
+Create Date: 2025-12-15 14:03:22.082746
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import sqlmodel.sql.sqltypes
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "041"
+down_revision = "040"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        "evaluation_run",
+        sa.Column(
+            "config_id",
+            sa.Uuid(),
+            nullable=True,
+            comment="Reference to the stored config used",
+        ),
+    )
+    op.add_column(
+        "evaluation_run",
+        sa.Column(
+            "config_version",
+            sa.Integer(),
+            nullable=True,
+            comment="Version of the config used",
+        ),
+    )
+    op.create_foreign_key("evaluation_run_config_id_fkey", "evaluation_run", "config", ["config_id"], ["id"])
+    op.drop_column("evaluation_run", "config")
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        "evaluation_run",
+        sa.Column(
+            "config",
+            postgresql.JSONB(astext_type=sa.Text()),
+            autoincrement=False,
+            nullable=False,
+            comment="Evaluation configuration (model, instructions, etc.)",
+        ),
+    )
+    op.drop_constraint("evaluation_run_config_id_fkey", "evaluation_run", type_="foreignkey")
+    op.drop_column("evaluation_run", "config_version")
+    op.drop_column("evaluation_run", "config_id")
+    # ### end Alembic commands ###
diff --git a/backend/app/api/routes/evaluations/dataset.py b/backend/app/api/routes/evaluations/dataset.py
index d66ff71ce..1ce42742a 100644
--- a/backend/app/api/routes/evaluations/dataset.py
+++ b/backend/app/api/routes/evaluations/dataset.py
@@ -48,7 +48,7 @@ def _dataset_to_response(dataset: EvaluationDataset) -> DatasetUploadResponse:
 
 
 @router.post(
-    "/",
+    "",
     description=load_description("evaluation/upload_dataset.md"),
     response_model=APIResponse[DatasetUploadResponse],
     dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
@@ -87,7 +87,7 @@ async def upload_dataset(
 
 
 @router.get(
-    "/",
+    "",
    description=load_description("evaluation/list_datasets.md"),
     response_model=APIResponse[list[DatasetUploadResponse]],
     dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py
index d40a88a1a..fe09edcec 100644
--- a/backend/app/api/routes/evaluations/evaluation.py
+++ b/backend/app/api/routes/evaluations/evaluation.py
@@ -1,6 +1,7 @@
 """Evaluation run API routes."""
 
 import logging
+from uuid import UUID
 
 from fastapi import (
     APIRouter,
@@ -29,7 +30,7 @@
 
 
 @router.post(
-    "/",
+    "",
     description=load_description("evaluation/create_evaluation.md"),
     response_model=APIResponse[EvaluationRunPublic],
     dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
@@ -41,19 +42,16 @@ def evaluate(
     experiment_name: str = Body(
         ..., description="Name for this evaluation experiment/run"
     ),
-    config: dict = Body(default_factory=dict, description="Evaluation configuration"),
-    assistant_id: str
-    | None = Body(
-        None, description="Optional assistant ID to fetch configuration from"
-    ),
+    config_id: UUID = Body(..., description="Stored config ID"),
+    config_version: int = Body(..., ge=1, description="Stored config version"),
 ) -> APIResponse[EvaluationRunPublic]:
     """Start an evaluation run."""
     eval_run = start_evaluation(
         session=_session,
         dataset_id=dataset_id,
         experiment_name=experiment_name,
-        config=config,
-        assistant_id=assistant_id,
+        config_id=config_id,
+        config_version=config_version,
         organization_id=auth_context.organization_.id,
         project_id=auth_context.project_.id,
     )
@@ -68,7 +66,7 @@ def evaluate(
 
 
 @router.get(
-    "/",
+    "",
     description=load_description("evaluation/list_evaluations.md"),
     response_model=APIResponse[list[EvaluationRunPublic]],
     dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py
index e667dcbb7..906c12ee9 100644
--- a/backend/app/crud/evaluations/__init__.py
+++ b/backend/app/crud/evaluations/__init__.py
@@ -5,6 +5,7 @@
     create_evaluation_run,
     get_evaluation_run_by_id,
     list_evaluation_runs,
+    resolve_model_from_config,
     save_score,
 )
 from app.crud.evaluations.cron import (
@@ -43,3 +44,44 @@
     TraceData,
     TraceScore,
 )
+
+__all__ = [
+    # Core
+    "create_evaluation_run",
+    "get_evaluation_run_by_id",
+    "list_evaluation_runs",
+    "resolve_model_from_config",
+    "save_score",
+    # Cron
+    "process_all_pending_evaluations",
+    "process_all_pending_evaluations_sync",
+    # Dataset
+    "create_evaluation_dataset",
+    "delete_dataset",
+    "get_dataset_by_id",
+    "list_datasets",
+    "upload_csv_to_object_store",
+    # Batch
+    "start_evaluation_batch",
+    # Processing
+    "check_and_process_evaluation",
+    "poll_all_pending_evaluations",
+    "process_completed_embedding_batch",
+    "process_completed_evaluation",
+    # Embeddings
+    "calculate_average_similarity",
+    "calculate_cosine_similarity",
+    "start_embedding_batch",
+    # Langfuse
+    "create_langfuse_dataset_run",
+    "fetch_trace_scores_from_langfuse",
+    "update_traces_with_cosine_scores",
+    "upload_dataset_to_langfuse",
+    # Score types
+    "CategoricalSummaryScore",
+    "EvaluationScore",
+    "NumericSummaryScore",
+    "SummaryScore",
+    "TraceData",
+    "TraceScore",
+]
diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py
index e880d7d0c..b23a31987 100644
--- a/backend/app/crud/evaluations/batch.py
+++ b/backend/app/crud/evaluations/batch.py
@@ -16,6 +16,7 @@
 
 from app.core.batch import OpenAIBatchProvider, start_batch_job
 from app.models import EvaluationRun
+from app.models.llm.request import KaapiLLMParams
 
 logger = logging.getLogger(__name__)
 
@@ -59,7 +60,7 @@ def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str,
 
 
 def build_evaluation_jsonl(
-    dataset_items: list[dict[str, Any]], config: dict[str, Any]
+    dataset_items: list[dict[str, Any]], config: KaapiLLMParams
 ) -> list[dict[str, Any]]:
     """
     Build JSONL data for evaluation batch using OpenAI Responses API.
@@ -88,7 +89,6 @@ def build_evaluation_jsonl(
         List of dictionaries (JSONL data)
     """
     jsonl_data = []
-
     for item in dataset_items:
         # Extract question from input
         question = item["input"].get("question", "")
@@ -105,7 +105,18 @@ def build_evaluation_jsonl(
             "method": "POST",
             "url": "/v1/responses",
             "body": {
-                **config,  # Use config as-is
+                # Build the request body from the stored config params
+                "model": config.model,
+                "instructions": config.instructions,
+                "temperature": config.temperature,
+                "reasoning": {"effort": config.reasoning} if config.reasoning else None,
+                "tools": [
+                    {
+                        "type": "file_search",
+                        "vector_store_ids": config.knowledge_base_ids,
+                        "max_num_results": config.max_num_results or 20,
+                    }
+                ],
                 "input": question,  # Add input from dataset
             },
         }
@@ -119,7 +130,7 @@ def start_evaluation_batch(
     openai_client: OpenAI,
     session: Session,
     eval_run: EvaluationRun,
-    config: dict[str, Any],
+    config: KaapiLLMParams,
 ) -> EvaluationRun:
     """
     Fetch data, build JSONL, and start evaluation batch.
@@ -166,7 +177,7 @@
         "description": f"Evaluation: {eval_run.run_name}",
         "completion_window": "24h",
         # Store complete config for reference
-        "evaluation_config": config,
+        "evaluation_config": config.model_dump(exclude_none=True),
     }
 
     # Step 5: Start batch job using generic infrastructure
diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py
index 33b6777f3..8fb06a62d 100644
--- a/backend/app/crud/evaluations/core.py
+++ b/backend/app/crud/evaluations/core.py
@@ -1,12 +1,16 @@
 import logging
+from uuid import UUID
 
 from langfuse import Langfuse
 from sqlmodel import Session, select
 
 from app.core.util import now
+from app.crud.config.version import ConfigVersionCrud
 from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse
 from app.crud.evaluations.score import EvaluationScore
 from app.models import EvaluationRun
+from app.models.llm.request import LLMCallConfig
+from app.services.llm.jobs import resolve_config_blob
 
 logger = logging.getLogger(__name__)
 
@@ -16,7 +20,8 @@ def create_evaluation_run(
     run_name: str,
     dataset_name: str,
     dataset_id: int,
-    config: dict,
+    config_id: UUID,
+    config_version: int,
     organization_id: int,
     project_id: int,
 ) -> EvaluationRun:
@@ -28,7 +33,8 @@ def create_evaluation_run(
         run_name: Name of the evaluation run/experiment
         dataset_name: Name of the dataset being used
         dataset_id: ID of the dataset
-        config: Configuration dict for the evaluation
+        config_id: UUID of the stored config
+        config_version: Version number of the config
         organization_id: Organization ID
         project_id: Project ID
@@ -39,7 +45,8 @@ def create_evaluation_run(
         run_name=run_name,
         dataset_name=dataset_name,
         dataset_id=dataset_id,
-        config=config,
+        config_id=config_id,
+        config_version=config_version,
         status="pending",
         organization_id=organization_id,
         project_id=project_id,
@@ -56,7 +63,10 @@ def create_evaluation_run(
         logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True)
         raise
 
-    logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}")
+    logger.info(
+        f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, "
+        f"config_id={config_id}, config_version={config_version}"
+    )
 
     return eval_run
 
@@ -311,3 +321,47 @@ def save_score(
         f"traces={len(score.get('traces', []))}"
     )
     return eval_run
+
+
+def resolve_model_from_config(
+    session: Session,
+    eval_run: EvaluationRun,
+) -> str:
+    """
+    Resolve the model name from the evaluation run's config.
+
+    Args:
+        session: Database session
+        eval_run: EvaluationRun instance
+
+    Returns:
+        Model name from config
+
+    Raises:
+        ValueError: If config is missing, invalid, or has no model
+    """
+    if not eval_run.config_id or not eval_run.config_version:
+        raise ValueError(
+            f"Evaluation run {eval_run.id} has no config reference "
+            f"(config_id={eval_run.config_id}, config_version={eval_run.config_version})"
+        )
+
+    config_version_crud = ConfigVersionCrud(
+        session=session,
+        config_id=eval_run.config_id,
+        project_id=eval_run.project_id,
+    )
+
+    config, error = resolve_config_blob(
+        config_crud=config_version_crud,
+        config=LLMCallConfig(id=eval_run.config_id, version=eval_run.config_version),
+    )
+
+    if error or config is None:
+        raise ValueError(
+            f"Config resolution failed for evaluation {eval_run.id} "
+            f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}"
+        )
+
+    model = config.completion.params.model
+    return model
diff --git a/backend/app/crud/evaluations/embeddings.py b/backend/app/crud/evaluations/embeddings.py
index 17ead39ab..20fabfd1a 100644
--- a/backend/app/crud/evaluations/embeddings.py
+++ b/backend/app/crud/evaluations/embeddings.py
@@ -363,19 +363,7 @@ def start_embedding_batch(
     logger.info(f"Starting embedding batch for evaluation run {eval_run.id}")
 
-    # Get embedding model from config (default: text-embedding-3-large)
-    embedding_model = eval_run.config.get(
-        "embedding_model", "text-embedding-3-large"
-    )
-
-    # Validate and fallback to default if invalid
-    try:
-        validate_embedding_model(embedding_model)
-    except ValueError as e:
-        logger.warning(
-            f"Invalid embedding model '{embedding_model}' in config: {e}. "
-            f"Falling back to text-embedding-3-large"
-        )
-        embedding_model = "text-embedding-3-large"
+    # Embedding model is no longer configurable per run; always use the default
+    embedding_model = "text-embedding-3-large"
 
     # Step 1: Build embedding JSONL with trace_ids
     jsonl_data = build_embedding_jsonl(
diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py
index 076ac9f32..2c7f8648b 100644
--- a/backend/app/crud/evaluations/processing.py
+++ b/backend/app/crud/evaluations/processing.py
@@ -26,7 +26,7 @@
     upload_batch_results_to_object_store,
 )
 from app.crud.evaluations.batch import fetch_dataset_items
-from app.crud.evaluations.core import update_evaluation_run
+from app.crud.evaluations.core import resolve_model_from_config, update_evaluation_run
 from app.crud.evaluations.embeddings import (
     calculate_average_similarity,
     parse_embedding_results,
@@ -254,16 +254,16 @@ async def process_completed_evaluation(
     if not results:
         raise ValueError("No valid results found in batch output")
 
-    # Extract model from config for cost tracking
-    model = eval_run.config.get("model") if eval_run.config else None
-
     # Step 5: Create Langfuse dataset run with traces
+    # Use the model stored at creation time for cost tracking
+    model = resolve_model_from_config(session=session, eval_run=eval_run)
+
     trace_id_mapping = create_langfuse_dataset_run(
         langfuse=langfuse,
         dataset_name=eval_run.dataset_name,
+        model=model,
         run_name=eval_run.run_name,
         results=results,
-        model=model,
     )
 
     # Store object store URL in database
diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py
index f99fbb27e..6ae4542fb 100644
--- a/backend/app/models/evaluation.py
+++ b/backend/app/models/evaluation.py
@@ -1,5 +1,6 @@
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Optional
+from uuid import UUID
 
 from pydantic import BaseModel, Field
 from sqlalchemy import Column, Index, Text, UniqueConstraint
@@ -193,15 +194,19 @@ class EvaluationRun(SQLModel, table=True):
         sa_column_kwargs={"comment": "Name of the Langfuse dataset used"},
     )
 
-    # Config field - dict requires sa_column
-    config: dict[str, Any] = SQLField(
-        default_factory=dict,
-        sa_column=Column(
-            JSONB,
-            nullable=False,
-            comment="Evaluation configuration (model, instructions, etc.)",
-        ),
-        description="Evaluation configuration",
+    config_id: UUID | None = SQLField(
+        default=None,
+        foreign_key="config.id",
+        nullable=True,
+        description="Reference to the stored config used for this evaluation",
+        sa_column_kwargs={"comment": "Reference to the stored config used"},
+    )
+    config_version: int | None = SQLField(
+        default=None,
+        nullable=True,
+        ge=1,
+        description="Version of the config used for this evaluation",
+        sa_column_kwargs={"comment": "Version of the config used"},
     )
 
     # Dataset reference
@@ -339,7 +344,8 @@ class EvaluationRunPublic(SQLModel):
     id: int
     run_name: str
     dataset_name: str
-    config: dict[str, Any]
+    config_id: UUID | None
+    config_version: int | None
     dataset_id: int
     batch_job_id: int | None
     embedding_batch_job_id: int | None
diff --git a/backend/app/services/evaluations/__init__.py b/backend/app/services/evaluations/__init__.py
index 62201b426..92d88fe0b 100644
--- a/backend/app/services/evaluations/__init__.py
+++ b/backend/app/services/evaluations/__init__.py
@@ -2,7 +2,6 @@
 
 from app.services.evaluations.dataset import upload_dataset
 from app.services.evaluations.evaluation import (
-    build_evaluation_config,
     get_evaluation_with_scores,
     start_evaluation,
 )
diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py
index 4c1a5de74..2ea4e8caa 100644
--- a/backend/app/services/evaluations/evaluation.py
+++ b/backend/app/services/evaluations/evaluation.py
@@ -1,11 +1,12 @@
 """Evaluation run orchestration service."""
 
 import logging
+from uuid import UUID
 
 from fastapi import HTTPException
 from sqlmodel import Session
 
-from app.crud.assistants import get_assistant_by_id
+from app.crud.config.version import ConfigVersionCrud
 from app.crud.evaluations import (
     create_evaluation_run,
     fetch_trace_scores_from_langfuse,
@@ -15,94 +16,20 @@
     start_evaluation_batch,
 )
 from app.models.evaluation import EvaluationRun
+from app.models.llm.request import LLMCallConfig
+from app.services.llm.jobs import resolve_config_blob
+from app.services.llm.providers import LLMProvider
 from app.utils import get_langfuse_client, get_openai_client
 
 logger = logging.getLogger(__name__)
 
 
-def build_evaluation_config(
-    session: Session,
-    config: dict,
-    assistant_id: str | None,
-    project_id: int,
-) -> dict:
-    """
-    Build evaluation configuration from assistant or provided config.
-
-    If assistant_id is provided, fetch assistant and merge with config.
-    Config values take precedence over assistant values.
-
-    Args:
-        session: Database session
-        config: Provided configuration dict
-        assistant_id: Optional assistant ID to fetch configuration from
-        project_id: Project ID for assistant lookup
-
-    Returns:
-        Complete evaluation configuration dict
-
-    Raises:
-        HTTPException: If assistant not found or model missing
-    """
-    if assistant_id:
-        assistant = get_assistant_by_id(
-            session=session,
-            assistant_id=assistant_id,
-            project_id=project_id,
-        )
-
-        if not assistant:
-            raise HTTPException(
-                status_code=404, detail=f"Assistant {assistant_id} not found"
-            )
-
-        logger.info(
-            f"[build_evaluation_config] Found assistant in DB | id={assistant.id} | "
-            f"model={assistant.model} | instructions="
-            f"{assistant.instructions[:50] if assistant.instructions else 'None'}..."
-        )
-
-        # Build config from assistant (use provided config values to override if present)
-        merged_config = {
-            "model": config.get("model", assistant.model),
-            "instructions": config.get("instructions", assistant.instructions),
-            "temperature": config.get("temperature", assistant.temperature),
-        }
-
-        # Add tools if vector stores are available
-        vector_store_ids = config.get(
-            "vector_store_ids", assistant.vector_store_ids or []
-        )
-        if vector_store_ids and len(vector_store_ids) > 0:
-            merged_config["tools"] = [
-                {
-                    "type": "file_search",
-                    "vector_store_ids": vector_store_ids,
-                }
-            ]
-
-        logger.info("[build_evaluation_config] Using config from assistant")
-        return merged_config
-
-    # Using provided config directly
-    logger.info("[build_evaluation_config] Using provided config directly")
-
-    # Validate that config has minimum required fields
-    if not config.get("model"):
-        raise HTTPException(
-            status_code=400,
-            detail="Config must include 'model' when assistant_id is not provided",
-        )
-
-    return config
-
-
 def start_evaluation(
     session: Session,
     dataset_id: int,
     experiment_name: str,
-    config: dict,
-    assistant_id: str | None,
+    config_id: UUID,
+    config_version: int,
     organization_id: int,
     project_id: int,
 ) -> EvaluationRun:
@@ -111,7 +38,7 @@ def start_evaluation(
 
     Steps:
     1. Validate dataset exists and has Langfuse ID
-    2. Build config (from assistant or direct)
+    2. Resolve config from stored config management
     3. Create evaluation run record
     4. Start batch processing
 
@@ -119,8 +46,8 @@ def start_evaluation(
     Args:
         session: Database session
         dataset_id: ID of the evaluation dataset
         experiment_name: Name for this evaluation experiment/run
-        config: Evaluation configuration
-        assistant_id: Optional assistant ID to fetch configuration from
+        config_id: UUID of the stored config
+        config_version: Version number of the config
         organization_id: Organization ID
         project_id: Project ID
@@ -128,16 +55,17 @@ def start_evaluation(
         EvaluationRun instance
 
     Raises:
-        HTTPException: If dataset not found or evaluation fails to start
+        HTTPException: If dataset not found, config invalid, or evaluation fails to start
     """
     logger.info(
         f"[start_evaluation] Starting evaluation | experiment_name={experiment_name} | "
         f"dataset_id={dataset_id} | "
         f"org_id={organization_id} | "
-        f"assistant_id={assistant_id} | "
-        f"config_keys={list(config.keys())}"
+        f"config_id={config_id} | "
+        f"config_version={config_version}"
     )
 
+    # Step 1: Fetch dataset from database
     dataset = get_dataset_by_id(
         session=session,
         dataset_id=dataset_id,
@@ -165,13 +93,31 @@ def start_evaluation(
             "Please ensure Langfuse credentials were configured when the dataset was created.",
         )
 
-    eval_config = build_evaluation_config(
-        session=session,
-        config=config,
-        assistant_id=assistant_id,
-        project_id=project_id,
+    # Step 2: Resolve config from stored config management
+    config_version_crud = ConfigVersionCrud(
+        session=session, config_id=config_id, project_id=project_id
+    )
+
+    config, error = resolve_config_blob(
+        config_crud=config_version_crud,
+        config=LLMCallConfig(id=config_id, version=config_version),
+    )
+    if error or config is None:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Failed to resolve config from stored config: {error}",
+        )
+    elif config.completion.provider != LLMProvider.OPENAI:
+        raise HTTPException(
+            status_code=422,
+            detail="Only 'openai' provider is supported for evaluation configs",
+        )
+
+    logger.info(
+        "[start_evaluation] Successfully resolved config from config management"
     )
 
+    # Get API clients
     openai_client = get_openai_client(
         session=session,
         org_id=organization_id,
@@ -183,23 +129,26 @@ def start_evaluation(
         project_id=project_id,
     )
 
+    # Step 3: Create EvaluationRun record with config references
     eval_run = create_evaluation_run(
         session=session,
         run_name=experiment_name,
         dataset_name=dataset.name,
         dataset_id=dataset_id,
-        config=eval_config,
+        config_id=config_id,
+        config_version=config_version,
         organization_id=organization_id,
         project_id=project_id,
     )
 
+    # Step 4: Start the batch evaluation
     try:
         eval_run = start_evaluation_batch(
             langfuse=langfuse,
             openai_client=openai_client,
             session=session,
             eval_run=eval_run,
-            config=eval_config,
+            config=config.completion.params,
         )
 
         logger.info(
diff --git a/backend/app/services/llm/providers/registry.py b/backend/app/services/llm/providers/registry.py
index a5cfb4bb8..7fb8d79f9 100644
--- a/backend/app/services/llm/providers/registry.py
+++ b/backend/app/services/llm/providers/registry.py
@@ -13,12 +13,14 @@ class LLMProvider:
     """
 
     OPENAI_NATIVE = "openai-native"
+    OPENAI = "openai"
     # Future constants for native providers:
     # CLAUDE_NATIVE = "claude-native"
     # GEMINI_NATIVE = "gemini-native"
 
     _registry: dict[str, type[BaseProvider]] = {
         OPENAI_NATIVE: OpenAIProvider,
+        OPENAI: OpenAIProvider,
         # Future native providers:
         # CLAUDE_NATIVE: ClaudeProvider,
         # GEMINI_NATIVE: GeminiProvider,
diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py
index 813fd483f..e26bc754a 100644
--- a/backend/app/tests/api/routes/test_evaluation.py
+++ b/backend/app/tests/api/routes/test_evaluation.py
@@ -1,6 +1,7 @@
 import io
 from typing import Any
 from unittest.mock import Mock, patch
+from uuid import uuid4
 
 import pytest
 from fastapi.testclient import TestClient
@@ -9,7 +10,7 @@
 from app.crud.evaluations.batch import build_evaluation_jsonl
 from app.models import EvaluationDataset, EvaluationRun
 from app.tests.utils.auth import TestAuthContext
-from app.tests.utils.test_data import create_test_evaluation_dataset
+from app.tests.utils.test_data import create_test_config, create_test_evaluation_dataset
 
 
 # Helper function to create CSV file-like object
@@ -524,15 +525,21 @@ def test_start_batch_evaluation_invalid_dataset_id(
         self,
         client: TestClient,
         user_api_key_header: dict[str, str],
-        sample_evaluation_config: dict[str, Any],
+        db: Session,
+        user_api_key: TestAuthContext,
     ) -> None:
-        """Test batch evaluation fails with invalid/non-existent dataset_id."""
+        """Test batch evaluation fails with invalid dataset_id."""
+        # Create a valid config to use
+        config = create_test_config(db, project_id=user_api_key.project_id)
+
+        # Try to start evaluation with non-existent dataset_id
         response = client.post(
             "/api/v1/evaluations/",
             json={
                 "experiment_name": "test_evaluation_run",
-                "dataset_id": 99999,
-                "config": sample_evaluation_config,
+                "dataset_id": 99999,  # Non-existent
+                "config_id": str(config.id),
+                "config_version": 1,
             },
             headers=user_api_key_header,
         )
@@ -547,32 +554,27 @@
     def test_start_batch_evaluation_missing_model(
         self, client: TestClient, user_api_key_header: dict[str, str]
     ) -> None:
-        """Test batch evaluation fails when model is missing from config."""
-        # We don't need a real dataset for this test - the validation should happen
-        # before dataset lookup. Use any dataset_id and expect config validation error
-        invalid_config = {
-            "instructions": "You are a helpful assistant",
-            "temperature": 0.5,
-        }
-
+        """Test batch evaluation fails with invalid config_id."""
+        # Test with a non-existent config_id (random UUID)
         response = client.post(
             "/api/v1/evaluations/",
             json={
-                "experiment_name": "test_no_model",
-                "dataset_id": 1,  # Dummy ID, error should come before this is checked
-                "config": invalid_config,
+                "experiment_name": "test_no_config",
+                "dataset_id": 1,  # Dummy ID, config validation happens first
+                "config_id": str(uuid4()),  # Non-existent config
+                "config_version": 1,
             },
             headers=user_api_key_header,
        )
 
-        # Should fail with either 400 (model missing) or 404 (dataset not found)
+        # Should fail with either 400 (config not found) or 404 (dataset/config not found)
         assert response.status_code in [400, 404]
         response_data = response.json()
         error_str = response_data.get(
             "detail", response_data.get("message", str(response_data))
         )
-        # Should fail with either "model" missing or "dataset not found" (both acceptable)
-        assert "model" in error_str.lower() or "not found" in error_str.lower()
+        # Should mention config or not found
+        assert "config" in error_str.lower() or "not found" in error_str.lower()
 
     def test_start_batch_evaluation_without_authentication(
         self, client, sample_evaluation_config
@@ -758,11 +760,15 @@ def test_get_evaluation_run_trace_info_not_completed(
         create_test_dataset: EvaluationDataset,
     ) -> None:
         """Test requesting trace info for incomplete evaluation returns error."""
+        # Create a config for the evaluation run
+        config = create_test_config(db, project_id=user_api_key.project_id)
+
         eval_run = EvaluationRun(
             run_name="test_pending_run",
             dataset_name=create_test_dataset.name,
             dataset_id=create_test_dataset.id,
-            config={"model": "gpt-4o"},
+            config_id=config.id,
+            config_version=1,
             status="pending",
             total_items=3,
             organization_id=user_api_key.organization_id,
@@ -794,11 +800,15 @@ def test_get_evaluation_run_trace_info_completed(
         create_test_dataset: EvaluationDataset,
     ) -> None:
         """Test requesting trace info for completed evaluation returns cached scores."""
+        # Create a config for the evaluation run
+        config = create_test_config(db, project_id=user_api_key.project_id)
+
         eval_run = EvaluationRun(
             run_name="test_completed_run",
             dataset_name=create_test_dataset.name,
             dataset_id=create_test_dataset.id,
-            config={"model": "gpt-4o"},
+            config_id=config.id,
+            config_version=1,
             status="completed",
             total_items=3,
             score={
@@ -853,11 +863,13 @@ def test_get_evaluation_run_without_trace_info(
         create_test_dataset: EvaluationDataset,
     ) -> None:
         """Test getting evaluation run without requesting trace info."""
+        config = create_test_config(db, project_id=user_api_key.project_id)
         eval_run = EvaluationRun(
             run_name="test_simple_run",
             dataset_name=create_test_dataset.name,
             dataset_id=create_test_dataset.id,
-            config={"model": "gpt-4o"},
+            config_id=config.id,
+            config_version=1,
             status="completed",
             total_items=3,
             organization_id=user_api_key.organization_id,
@@ -888,11 +900,13 @@ def test_get_evaluation_run_resync_without_trace_info_fails(
         create_test_dataset: EvaluationDataset,
     ) -> None:
         """Test that resync_score=true requires get_trace_info=true."""
+        config = create_test_config(db, project_id=user_api_key.project_id)
         eval_run = EvaluationRun(
             run_name="test_run",
             dataset_name=create_test_dataset.name,
             dataset_id=create_test_dataset.id,
-            config={"model": "gpt-4o"},
+            config_id=config.id,
+            config_version=1,
             status="completed",
             total_items=3,
             organization_id=user_api_key.organization_id,