diff --git a/AGENTS.md b/AGENTS.md
index a953de3a3f..37aa446b3f 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -7,7 +7,22 @@
 ## Testing Instructions
-- Tests are currently still not working and should not be run
+
+For comprehensive testing documentation, see [docs/designs/testing/README.md](docs/designs/testing/README.md).
+
+Quick overview:
+- **API Tests**: `cd api && python run-tests.py --api-url --auth-key --license oss`
+- **SDK Tests**: `cd sdk && python run-tests.py --api-url --auth-key --license oss`
+- **Web Tests**: `cd web/tests && AGENTA_WEB_URL= TESTMAIL_NAMESPACE= TESTMAIL_API_KEY= pnpm tsx playwright/scripts/run-tests.ts --coverage smoke`
+
+Test documentation covers:
+- Testing principles and philosophy
+- Test boundaries (utils, unit, E2E)
+- Test dimensions (coverage, path, case, lens, speed, license, cost, role, plan)
+- Interface-specific guides (API, SDK, Web, Services)
+- Test structure and organization
+- Fixtures and utilities
+- Running tests locally and in CI
 ## PR instructions
 - If the user provides you with the issue id, title the PR: [issue-id] fix(frontend): where fix is the type (fix, feat, chore, ci, doc, test.. [we're using better-branch) and frontend is where and it could be API, SDK, frontend, docs, ..
diff --git a/api/ee/tests/pytest/billing_period_test_cases.csv b/api/ee/tests/pytest/e2e/billing_period_test_cases.csv
similarity index 100%
rename from api/ee/tests/pytest/billing_period_test_cases.csv
rename to api/ee/tests/pytest/e2e/billing_period_test_cases.csv
diff --git a/api/ee/tests/pytest/test_billing_period.py b/api/ee/tests/pytest/e2e/test_billing_period.py
similarity index 100%
rename from api/ee/tests/pytest/test_billing_period.py
rename to api/ee/tests/pytest/e2e/test_billing_period.py
diff --git a/api/oss/src/apis/fastapi/auth/router.py b/api/oss/src/apis/fastapi/auth/router.py
index 1a4cf6e876..2751b38f83 100644
--- a/api/oss/src/apis/fastapi/auth/router.py
+++ b/api/oss/src/apis/fastapi/auth/router.py
@@ -166,7 +166,7 @@ async def sso_callback_redirect(
     if not is_ee():
         raise HTTPException(
             status_code=404,
-            detail="SSO/OIDC is only available in Enterprise Edition",
+            detail="SSO/OIDC is only available in EE",
         )
     try:
diff --git a/api/oss/src/apis/fastapi/evaluations/router.py b/api/oss/src/apis/fastapi/evaluations/router.py
index e69a26b3dc..a196a66005 100644
--- a/api/oss/src/apis/fastapi/evaluations/router.py
+++ b/api/oss/src/apis/fastapi/evaluations/router.py
@@ -1129,7 +1129,7 @@ async def edit_scenario(
         ):
             raise FORBIDDEN_EXCEPTION  # type: ignore
-        if str(scenario_id) != scenario_edit_request.scenario.id:
+        if str(scenario_id) != str(scenario_edit_request.scenario.id):
             return EvaluationScenarioResponse()
         scenario = await self.evaluations_service.edit_scenario(
@@ -1350,7 +1350,7 @@ async def edit_result(
         ):
             raise FORBIDDEN_EXCEPTION  # type: ignore
-        if str(result_id) != result_edit_request.result.id:
+        if str(result_id) != str(result_edit_request.result.id):
             return EvaluationResultResponse()
         result = await self.evaluations_service.edit_result(
@@ -1729,7 +1729,7 @@ async def edit_queue(
         ):
             raise FORBIDDEN_EXCEPTION  # type: ignore
-        if str(queue_id) != queue_edit_request.queue.id:
+        if str(queue_id) != str(queue_edit_request.queue.id):
             return EvaluationQueueResponse()
         queue = await self.evaluations_service.edit_queue(
@@ -2116,7 +2116,7 @@ async def edit_evaluation(
         ):
             raise FORBIDDEN_EXCEPTION  # type: ignore
-        if str(evaluation_id) != evaluation_edit_request.evaluation.id:
+        if str(evaluation_id) !=
str(evaluation_edit_request.evaluation.id): return SimpleEvaluationResponse() evaluation_edit = evaluation_edit_request.evaluation diff --git a/api/oss/src/apis/fastapi/workflows/router.py b/api/oss/src/apis/fastapi/workflows/router.py index 5e171f2e73..f1ee7d1eec 100644 --- a/api/oss/src/apis/fastapi/workflows/router.py +++ b/api/oss/src/apis/fastapi/workflows/router.py @@ -1002,7 +1002,7 @@ async def commit_workflow_revision( ): raise FORBIDDEN_EXCEPTION # type: ignore - if str(workflow_variant_id) != str( + if workflow_variant_id is not None and str(workflow_variant_id) != str( workflow_revision_commit_request.workflow_revision.workflow_variant_id ): return WorkflowRevisionResponse() diff --git a/api/oss/src/dbs/postgres/blobs/dao.py b/api/oss/src/dbs/postgres/blobs/dao.py index 0929ae517c..5c1f282ecf 100644 --- a/api/oss/src/dbs/postgres/blobs/dao.py +++ b/api/oss/src/dbs/postgres/blobs/dao.py @@ -443,10 +443,11 @@ async def query_blobs( self.BlobDBE.tags.contains(blob_query.tags), # type: ignore ) - if blob_query.meta: - stmt = stmt.filter( - self.BlobDBE.meta.contains(blob_query.meta), # type: ignore - ) + # meta is JSON (not JSONB) — containment (@>) is not supported + # if blob_query.meta: + # stmt = stmt.filter( + # self.BlobDBE.meta.contains(blob_query.meta), + # ) if windowing: stmt = apply_windowing( diff --git a/api/oss/src/dbs/postgres/evaluations/dao.py b/api/oss/src/dbs/postgres/evaluations/dao.py index 0b43c010f1..aa6b654867 100644 --- a/api/oss/src/dbs/postgres/evaluations/dao.py +++ b/api/oss/src/dbs/postgres/evaluations/dao.py @@ -492,7 +492,7 @@ async def close_run( mode="json", ) - # run_dbe.flags["is_closed"] = True # type: ignore + run_dbe.flags["is_closed"] = True # type: ignore flag_modified(run_dbe, "flags") run_dbe.updated_at = datetime.now(timezone.utc) # type: ignore @@ -537,7 +537,7 @@ async def close_runs( mode="json", ) - # run_dbe.flags["is_closed"] = True # type: ignore + run_dbe.flags["is_closed"] = True # type: ignore flag_modified(run_dbe, "flags") run_dbe.updated_at = datetime.now(timezone.utc) # type: ignore @@ -690,10 +690,11 @@ async def query_runs( EvaluationRunDBE.tags.contains(run.tags), ) - if run.meta is not None: - stmt = stmt.filter( - EvaluationRunDBE.meta.contains(run.meta), - ) + # meta is JSON (not JSONB) — containment (@>) is not supported + # if run.meta is not None: + # stmt = stmt.filter( + # EvaluationRunDBE.meta.contains(run.meta), + # ) if run.status is not None: stmt = stmt.filter( @@ -1245,10 +1246,11 @@ async def query_scenarios( EvaluationScenarioDBE.tags.contains(scenario.tags), ) - if scenario.meta is not None: - stmt = stmt.filter( - EvaluationScenarioDBE.meta.contains(scenario.meta), - ) + # meta is JSON (not JSONB) — containment (@>) is not supported + # if scenario.meta is not None: + # stmt = stmt.filter( + # EvaluationScenarioDBE.meta.contains(scenario.meta), + # ) if scenario.status is not None: stmt = stmt.filter( @@ -1765,10 +1767,11 @@ async def query_results( EvaluationResultDBE.tags.contains(result.tags), ) - if result.meta is not None: - stmt = stmt.filter( - EvaluationResultDBE.meta.contains(result.meta), - ) + # meta is JSON (not JSONB) — containment (@>) is not supported + # if result.meta is not None: + # stmt = stmt.filter( + # EvaluationResultDBE.meta.contains(result.meta), + # ) if result.status is not None: stmt = stmt.filter( @@ -2220,10 +2223,11 @@ async def query_metrics( EvaluationMetricsDBE.tags.contains(metric.tags), ) - if metric.meta is not None: - stmt = stmt.filter( - 
EvaluationMetricsDBE.meta.contains(metric.meta), - ) + # meta is JSON (not JSONB) — containment (@>) is not supported + # if metric.meta is not None: + # stmt = stmt.filter( + # EvaluationMetricsDBE.meta.contains(metric.meta), + # ) if metric.status is not None: stmt = stmt.filter( @@ -2679,10 +2683,11 @@ async def query_queues( EvaluationQueueDBE.tags.contains(queue.tags), ) - if queue.meta is not None: - stmt = stmt.filter( - EvaluationQueueDBE.meta.contains(queue.meta), - ) + # meta is JSON (not JSONB) — containment (@>) is not supported + # if queue.meta is not None: + # stmt = stmt.filter( + # EvaluationQueueDBE.meta.contains(queue.meta), + # ) if queue.name is not None: stmt = stmt.filter( diff --git a/api/oss/src/dbs/postgres/folders/dao.py b/api/oss/src/dbs/postgres/folders/dao.py index 6a04ff2f27..725a087d79 100644 --- a/api/oss/src/dbs/postgres/folders/dao.py +++ b/api/oss/src/dbs/postgres/folders/dao.py @@ -372,8 +372,9 @@ async def query( if folder_query.flags is not None: stmt = stmt.filter(FolderDBE.flags.contains(folder_query.flags)) - if folder_query.meta is not None: - stmt = stmt.filter(FolderDBE.meta.contains(folder_query.meta)) + # meta is JSON (not JSONB) — containment (@>) is not supported + # if folder_query.meta is not None: + # stmt = stmt.filter(FolderDBE.meta.contains(folder_query.meta)) result = await session.execute(stmt) diff --git a/api/oss/src/dbs/postgres/git/dao.py b/api/oss/src/dbs/postgres/git/dao.py index 9f8645530f..037b7adeaa 100644 --- a/api/oss/src/dbs/postgres/git/dao.py +++ b/api/oss/src/dbs/postgres/git/dao.py @@ -332,10 +332,11 @@ async def query_artifacts( self.ArtifactDBE.tags.contains(artifact_query.tags) # type: ignore ) - if artifact_query.meta: - stmt = stmt.filter( - self.ArtifactDBE.meta.contains(artifact_query.meta) # type: ignore - ) + # meta is JSON (not JSONB) — containment (@>) is not supported + # if artifact_query.meta: + # stmt = stmt.filter( + # self.ArtifactDBE.meta.contains(artifact_query.meta) + # ) if artifact_query.name: stmt = stmt.filter( @@ -665,10 +666,11 @@ async def query_variants( self.VariantDBE.tags.contains(variant_query.tags) # type: ignore ) - if variant_query.meta: - stmt = stmt.filter( - self.VariantDBE.meta.contains(variant_query.meta) # type: ignore - ) + # meta is JSON (not JSONB) — containment (@>) is not supported + # if variant_query.meta: + # stmt = stmt.filter( + # self.VariantDBE.meta.contains(variant_query.meta) + # ) if variant_query.name: stmt = stmt.filter( @@ -877,7 +879,7 @@ async def create_revision( revision.version = await self._get_version( project_id=project_id, variant_id=revision.variant_id, # type: ignore - created_at=revision.created_at, # type: ignore + revision_id=revision.id, # type: ignore ) await self._set_version( @@ -918,6 +920,13 @@ async def fetch_revision( elif variant_ref: if variant_ref.id: stmt = stmt.filter(self.RevisionDBE.variant_id == variant_ref.id) # type: ignore + elif variant_ref.slug: + stmt = stmt.join( + self.VariantDBE, + self.RevisionDBE.variant_id == self.VariantDBE.id, # type: ignore + ).filter( + self.VariantDBE.slug == variant_ref.slug, # type: ignore + ) if revision_ref and revision_ref.version: stmt = stmt.filter(self.RevisionDBE.version == revision_ref.version) # type: ignore @@ -1140,10 +1149,11 @@ async def query_revisions( self.RevisionDBE.tags.contains(revision_query.tags) # type: ignore ) - if revision_query.meta: - stmt = stmt.filter( - self.RevisionDBE.meta.contains(revision_query.meta) # type: ignore - ) + # meta is JSON (not JSONB) — containment 
(@>) is not supported + # if revision_query.meta: + # stmt = stmt.filter( + # self.RevisionDBE.meta.contains(revision_query.meta) + # ) if revision_query.author: stmt = stmt.filter( @@ -1271,7 +1281,7 @@ async def commit_revision( revision.version = await self._get_version( project_id=project_id, variant_id=revision.variant_id, # type: ignore - created_at=revision.created_at, # type: ignore + revision_id=revision.id, # type: ignore ) await self._set_version( @@ -1394,7 +1404,7 @@ async def _get_version( *, project_id: UUID, variant_id: UUID, - created_at: datetime, + revision_id: UUID, ) -> str: async with engine.core_session() as session: stmt = ( @@ -1403,7 +1413,7 @@ async def _get_version( .where( self.RevisionDBE.project_id == project_id, # type: ignore self.RevisionDBE.variant_id == variant_id, # type: ignore - self.RevisionDBE.created_at < created_at, # type: ignore + self.RevisionDBE.id < revision_id, # type: ignore ) ) diff --git a/api/oss/src/services/variants_manager.py b/api/oss/src/services/variants_manager.py index 246a01df2f..5186895b5d 100644 --- a/api/oss/src/services/variants_manager.py +++ b/api/oss/src/services/variants_manager.py @@ -993,9 +993,10 @@ async def fork_config_by_variant_ref( if app_variant_revision.data: params = app_variant_revision.data.parameters or {} - # Build compound slug for the forked variant + # Build compound slug for the forked variant (always unique) + unique_suffix = uuid4().hex[-12:] if variant_ref.slug: - # Fetch app to construct compound slug: {app_slug}.{variant_name} + # Fetch app to construct compound slug: {app_slug}.{variant_name}_{suffix} app = await _fetch_app( project_id=project_id, app_id=app_variant.application_id, @@ -1003,10 +1004,10 @@ async def fork_config_by_variant_ref( if not app: log.error(f"App not found for application_id: {app_variant.application_id}") return None - fork_slug = f"{app.slug}.{variant_ref.slug}" + fork_slug = f"{app.slug}.{variant_ref.slug}_{unique_suffix}" else: # app_variant.slug is already compound; append a unique suffix - fork_slug = app_variant.slug + "_" + uuid4().hex[-12:] + fork_slug = app_variant.slug + "_" + unique_suffix variant_slug, variant_version = await _create_variant( project_id=project_id, diff --git a/api/oss/tests/pytest/annotations/__init__.py b/api/oss/tests/pytest/e2e/annotations/__init__.py similarity index 100% rename from api/oss/tests/pytest/annotations/__init__.py rename to api/oss/tests/pytest/e2e/annotations/__init__.py diff --git a/api/oss/tests/pytest/annotations/test_annotations_basics.py b/api/oss/tests/pytest/e2e/annotations/test_annotations_basics.py similarity index 100% rename from api/oss/tests/pytest/annotations/test_annotations_basics.py rename to api/oss/tests/pytest/e2e/annotations/test_annotations_basics.py diff --git a/api/oss/tests/pytest/annotations/test_annotations_queries.py b/api/oss/tests/pytest/e2e/annotations/test_annotations_queries.py similarity index 100% rename from api/oss/tests/pytest/annotations/test_annotations_queries.py rename to api/oss/tests/pytest/e2e/annotations/test_annotations_queries.py diff --git a/api/oss/tests/pytest/evaluations/__init__.py b/api/oss/tests/pytest/e2e/evaluations/__init__.py similarity index 100% rename from api/oss/tests/pytest/evaluations/__init__.py rename to api/oss/tests/pytest/e2e/evaluations/__init__.py diff --git a/api/oss/tests/pytest/evaluations/test_evaluation_metrics_basics.py b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_metrics_basics.py similarity index 92% rename from 
api/oss/tests/pytest/evaluations/test_evaluation_metrics_basics.py rename to api/oss/tests/pytest/e2e/evaluations/test_evaluation_metrics_basics.py index 900608f0fa..18aa496899 100644 --- a/api/oss/tests/pytest/evaluations/test_evaluation_metrics_basics.py +++ b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_metrics_basics.py @@ -2,7 +2,7 @@ class TestEvaluationMetricsBasics: def test_create_evaluation_metrics(self, authed_api): # ARRANGE -------------------------------------------------------------- runs = [ - {"name": "test_evaluation_steps_basics"}, + {"name": "test_evaluation_metrics_basics"}, ] response = authed_api( @@ -46,7 +46,7 @@ def test_create_evaluation_metrics(self, authed_api): def test_edit_evaluation_metrics(self, authed_api): # ARRANGE -------------------------------------------------------------- runs = [ - {"name": "test_evaluation_steps_basics"}, + {"name": "test_edit_evaluation_metrics"}, ] response = authed_api( @@ -108,7 +108,7 @@ def test_edit_evaluation_metrics(self, authed_api): def test_delete_evaluation_metrics(self, authed_api): # ARRANGE -------------------------------------------------------------- runs = [ - {"name": "test_evaluation_steps_basics"}, + {"name": "test_delete_evaluation_metrics"}, ] response = authed_api( @@ -176,7 +176,7 @@ def test_delete_evaluation_metrics(self, authed_api): def test_fetch_evaluation_metric(self, authed_api): # ARRANGE -------------------------------------------------------------- runs = [ - {"name": "test_evaluation_steps_basics"}, + {"name": "test_fetch_evaluation_metric"}, ] response = authed_api( @@ -214,28 +214,35 @@ def test_fetch_evaluation_metric(self, authed_api): # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ + # NOTE: GET /metrics/{id} does not exist, use POST /metrics/query response = authed_api( - "GET", - f"/preview/evaluations/metrics/{metric['id']}", + "POST", + "/preview/evaluations/metrics/query", + json={ + "metrics": { + "run_id": run_id, + }, + }, ) # ---------------------------------------------------------------------- # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() - print(response) - assert response["count"] == 1 - assert response["metric"]["id"] == metric["id"] - assert response["metric"]["data"]["integer_metric"] == 42 - assert response["metric"]["data"]["float_metric"] == 3.14 - assert response["metric"]["data"]["string_metric"] == "test" - assert response["metric"]["data"]["boolean_metric"] is True + assert response["count"] >= 1 + metric_ids = [m["id"] for m in response["metrics"]] + assert metric["id"] in metric_ids + matched = [m for m in response["metrics"] if m["id"] == metric["id"]][0] + assert matched["data"]["integer_metric"] == 42 + assert matched["data"]["float_metric"] == 3.14 + assert matched["data"]["string_metric"] == "test" + assert matched["data"]["boolean_metric"] is True # ---------------------------------------------------------------------- def test_edit_evaluation_metric(self, authed_api): # ARRANGE -------------------------------------------------------------- runs = [ - {"name": "test_evaluation_steps_basics"}, + {"name": "test_edit_evaluation_metric"}, ] response = authed_api( @@ -298,7 +305,7 @@ def test_edit_evaluation_metric(self, authed_api): def test_delete_evaluation_metric(self, authed_api): # ARRANGE -------------------------------------------------------------- runs = [ - 
{"name": "test_evaluation_steps_basics"}, + {"name": "test_delete_evaluation_metric"}, ] response = authed_api( diff --git a/api/oss/tests/pytest/evaluations/test_evaluation_metrics_queries.py b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_metrics_queries.py similarity index 81% rename from api/oss/tests/pytest/evaluations/test_evaluation_metrics_queries.py rename to api/oss/tests/pytest/e2e/evaluations/test_evaluation_metrics_queries.py index 695325d49f..1cc3cfa0a1 100644 --- a/api/oss/tests/pytest/evaluations/test_evaluation_metrics_queries.py +++ b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_metrics_queries.py @@ -27,42 +27,57 @@ def mock_data(authed_api): "meta2": "value2", } - metrics = [ - { - "run_id": runs[0]["id"], - "status": "success", - "data": { - "integer_metric": 42, - "float_metric": 3.14, - "string_metric": "test", - "boolean_metric": True, - }, - "tags": tags, - "meta": meta, - }, - { - "run_id": runs[1]["id"], - "status": "failure", - "data": { - "integer_metric": 42, - "float_metric": 3.14, - "string_metric": "test", - "boolean_metric": True, - }, + response = authed_api( + "POST", + "/preview/evaluations/metrics/", + json={ + "metrics": [ + { + "run_id": runs[0]["id"], + "status": "success", + "data": { + "integer_metric": 42, + "float_metric": 3.14, + "string_metric": "test", + "boolean_metric": True, + }, + "tags": tags, + "meta": meta, + }, + ] }, - ] + ) + + assert response.status_code == 200 + assert response.json()["count"] == 1 + + metric_1 = response.json()["metrics"][0] response = authed_api( "POST", "/preview/evaluations/metrics/", - json={"metrics": metrics}, + json={ + "metrics": [ + { + "run_id": runs[1]["id"], + "status": "failure", + "data": { + "integer_metric": 42, + "float_metric": 3.14, + "string_metric": "test", + "boolean_metric": True, + }, + }, + ] + }, ) assert response.status_code == 200 - response = response.json() - assert response["count"] == 2 + assert response.json()["count"] == 1 - metrics = response["metrics"] + metric_2 = response.json()["metrics"][0] + + metrics = [metric_1, metric_2] # -------------------------------------------------------------------------- _mock_data = { @@ -85,7 +100,7 @@ def test_query_metrics_by_ids(self, authed_api, mock_data): "POST", "/preview/evaluations/metrics/query", json={ - "metric": { + "metrics": { "ids": metrics_ids, } }, @@ -103,6 +118,7 @@ def test_query_metrics_by_tags(self, authed_api, mock_data): # ARRANGE -------------------------------------------------------------- metrics = mock_data["metrics"] metrics_ids = [metric["id"] for metric in metrics] + run_ids = [r["id"] for r in mock_data["runs"]] # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ @@ -110,7 +126,8 @@ def test_query_metrics_by_tags(self, authed_api, mock_data): "POST", "/preview/evaluations/metrics/query", json={ - "metric": { + "metrics": { + "run_ids": run_ids, "tags": { "tags1": "value1", "tags2": "value2", @@ -127,37 +144,10 @@ def test_query_metrics_by_tags(self, authed_api, mock_data): assert all(metric["id"] in metrics_ids for metric in response["metrics"]) # ---------------------------------------------------------------------- - def test_query_metrics_by_meta(self, authed_api, mock_data): - # ARRANGE -------------------------------------------------------------- - metrics = mock_data["metrics"] - metrics_ids = [metric["id"] for metric in metrics] - # 
---------------------------------------------------------------------- - - # ACT ------------------------------------------------------------------ - response = authed_api( - "POST", - "/preview/evaluations/metrics/query", - json={ - "metric": { - "meta": { - "meta1": "value1", - "meta2": "value2", - }, - } - }, - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert all(metric["id"] in metrics_ids for metric in response["metrics"]) - # ---------------------------------------------------------------------- - def test_query_metrics_by_status(self, authed_api, mock_data): # ARRANGE -------------------------------------------------------------- metrics = mock_data["metrics"] + run_ids = [r["id"] for r in mock_data["runs"]] metrics_ids = [ metric["id"] for metric in metrics if metric["status"] == "success" ] @@ -168,7 +158,8 @@ def test_query_metrics_by_status(self, authed_api, mock_data): "POST", "/preview/evaluations/metrics/query", json={ - "metric": { + "metrics": { + "run_ids": run_ids, "status": "success", } }, @@ -185,6 +176,7 @@ def test_query_metrics_by_status(self, authed_api, mock_data): def test_query_metrics_by_statuses(self, authed_api, mock_data): # ARRANGE -------------------------------------------------------------- metrics = mock_data["metrics"] + run_ids = [r["id"] for r in mock_data["runs"]] metrics_ids = [ metric["id"] for metric in metrics @@ -197,7 +189,8 @@ def test_query_metrics_by_statuses(self, authed_api, mock_data): "POST", "/preview/evaluations/metrics/query", json={ - "metric": { + "metrics": { + "run_ids": run_ids, "statuses": ["success", "failure"], } }, @@ -223,7 +216,7 @@ def test_query_metrics_by_run_id(self, authed_api, mock_data): "POST", "/preview/evaluations/metrics/query", json={ - "metric": { + "metrics": { "run_id": run_id, } }, @@ -251,7 +244,7 @@ def test_query_metrics_by_run_ids(self, authed_api, mock_data): "POST", "/preview/evaluations/metrics/query", json={ - "metric": { + "metrics": { "run_ids": run_ids, } }, @@ -287,24 +280,25 @@ def test_query_metrics_no_timestamps_filters(self, authed_api, mock_data): # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ + # timestamps: False => metrics WHERE timestamp IS NULL (run-level) run_level_response = authed_api( "POST", "/preview/evaluations/metrics/query", json={ - "metric": { + "metrics": { "run_id": run_id, - "scenario_ids": True, + "timestamps": False, } }, ) + # timestamps: True => metrics WHERE timestamp IS NOT NULL (temporal) temporal_response = authed_api( "POST", "/preview/evaluations/metrics/query", json={ - "metric": { + "metrics": { "run_id": run_id, - "scenario_ids": True, - "timestamps": False, + "timestamps": True, } }, ) diff --git a/api/oss/tests/pytest/evaluations/test_evaluation_runs_basics.py b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_runs_basics.py similarity index 74% rename from api/oss/tests/pytest/evaluations/test_evaluation_runs_basics.py rename to api/oss/tests/pytest/e2e/evaluations/test_evaluation_runs_basics.py index 75a004e236..f56050e4a1 100644 --- a/api/oss/tests/pytest/evaluations/test_evaluation_runs_basics.py +++ b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_runs_basics.py @@ -5,33 +5,26 @@ class TestEvaluationRunsBasics: def 
test_create_evaluation_runs(self, authed_api): # ACT ------------------------------------------------------------------ testset_id = str(uuid4()) - testset_variant_id = str(uuid4()) - testset_revision_id = str(uuid4()) application_id = str(uuid4()) - application_variant_id = str(uuid4()) - application_revision_id = str(uuid4()) evaluator_id = str(uuid4()) - evaluator_variant_id = str(uuid4()) - evaluator_revision_id = str(uuid4()) steps = [ { "key": "input", - "is_testcase": True, + "type": "input", + "origin": "custom", "references": { "testset": {"id": testset_id}, - "testset_variant": {"id": testset_variant_id}, - "testset_revision": {"id": testset_revision_id}, }, }, { "key": "invocation", + "type": "invocation", + "origin": "auto", "references": { "application": {"id": application_id}, - "application_variant": {"id": application_variant_id}, - "application_revision": {"id": application_revision_id}, }, "inputs": [ {"key": "input"}, @@ -39,10 +32,10 @@ def test_create_evaluation_runs(self, authed_api): }, { "key": "annotation", + "type": "annotation", + "origin": "auto", "references": { "evaluator": {"id": evaluator_id}, - "evaluator_variant": {"id": evaluator_variant_id}, - "evaluator_revision": {"id": evaluator_revision_id}, }, "inputs": [ {"key": "input"}, @@ -53,33 +46,27 @@ def test_create_evaluation_runs(self, authed_api): mappings = [ { - "kind": "input", - "name": "Country", + "column": {"kind": "input", "name": "Country"}, "step": {"key": "input", "path": "country"}, }, { - "kind": "ground_truth", - "name": "Capital (expected)", + "column": {"kind": "ground_truth", "name": "Capital (expected)"}, "step": {"key": "input", "path": "correct_answer"}, }, { - "kind": "application", - "name": "Capital (actual)", + "column": {"kind": "application", "name": "Capital (actual)"}, "step": {"key": "invocation", "path": "data.outputs.answer"}, }, { - "kind": "evaluator", - "name": "Score", + "column": {"kind": "evaluator", "name": "Score"}, "step": {"key": "annotation", "path": "data.outputs.score"}, }, { - "kind": "evaluator", - "name": "Confidence", + "column": {"kind": "evaluator", "name": "Confidence"}, "step": {"key": "annotation", "path": "data.outputs.confidence"}, }, { - "kind": "evaluator", - "name": "Explanation", + "column": {"kind": "evaluator", "name": "Explanation"}, "step": {"key": "annotation", "path": "data.outputs.explanation"}, }, ] @@ -110,7 +97,7 @@ def test_create_evaluation_runs(self, authed_api): response = authed_api( "POST", "/preview/evaluations/runs/", - json={"runs": runs}, + json={"jit": False, "runs": runs}, ) # ---------------------------------------------------------------------- @@ -118,11 +105,18 @@ def test_create_evaluation_runs(self, authed_api): assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert response["runs"][0]["tags"] == tags - assert response["runs"][0]["meta"] == meta - assert response["runs"][0]["status"] == "pending" - assert response["runs"][0]["data"]["steps"] == steps - assert response["runs"][0]["data"]["mappings"] == mappings + run = response["runs"][0] + assert run["tags"] == tags + assert run["meta"] == meta + assert run["status"] == "pending" + assert len(run["data"]["steps"]) == 3 + assert run["data"]["steps"][0]["key"] == "input" + assert run["data"]["steps"][0]["type"] == "input" + assert run["data"]["steps"][1]["key"] == "invocation" + assert run["data"]["steps"][1]["type"] == "invocation" + assert run["data"]["steps"][2]["key"] == "annotation" + assert 
run["data"]["steps"][2]["type"] == "annotation" + assert len(run["data"]["mappings"]) == 6 # ---------------------------------------------------------------------- def test_delete_evaluation_runs(self, authed_api): @@ -182,11 +176,11 @@ def test_delete_evaluation_runs(self, authed_api): assert response["count"] == 0 # ---------------------------------------------------------------------- - def test_archive_evaluation_runs(self, authed_api): + def test_close_evaluation_runs(self, authed_api): # ARRANGE -------------------------------------------------------------- runs = [ - {"name": "test_archive_evaluation_runs_1"}, - {"name": "test_archive_evaluation_runs_2"}, + {"name": "test_close_evaluation_runs_1"}, + {"name": "test_close_evaluation_runs_2"}, ] response = authed_api( @@ -199,8 +193,8 @@ def test_archive_evaluation_runs(self, authed_api): response = response.json() assert response["count"] == 2 runs = response["runs"] - assert runs[0]["name"] == "test_archive_evaluation_runs_1" - assert runs[1]["name"] == "test_archive_evaluation_runs_2" + assert runs[0]["name"] == "test_close_evaluation_runs_1" + assert runs[1]["name"] == "test_close_evaluation_runs_2" run_id_1 = runs[0]["id"] run_id_2 = runs[1]["id"] # ---------------------------------------------------------------------- @@ -208,7 +202,7 @@ def test_archive_evaluation_runs(self, authed_api): # ACT ------------------------------------------------------------------ response = authed_api( "POST", - "/preview/evaluations/runs/archive", + "/preview/evaluations/runs/close", json={"run_ids": [run_id_1, run_id_2]}, ) # ---------------------------------------------------------------------- @@ -221,11 +215,11 @@ def test_archive_evaluation_runs(self, authed_api): assert response["runs"][1]["id"] == run_id_2 # ---------------------------------------------------------------------- - def test_unarchive_evaluation_runs(self, authed_api): + def test_open_evaluation_runs(self, authed_api): # ARRANGE -------------------------------------------------------------- runs = [ - {"name": "test_unarchive_evaluation_runs_1"}, - {"name": "test_unarchive_evaluation_runs_2"}, + {"name": "test_open_evaluation_runs_1"}, + {"name": "test_open_evaluation_runs_2"}, ] response = authed_api( @@ -238,14 +232,12 @@ def test_unarchive_evaluation_runs(self, authed_api): response = response.json() assert response["count"] == 2 runs = response["runs"] - assert runs[0]["name"] == "test_unarchive_evaluation_runs_1" - assert runs[1]["name"] == "test_unarchive_evaluation_runs_2" run_id_1 = runs[0]["id"] run_id_2 = runs[1]["id"] response = authed_api( "POST", - "/preview/evaluations/runs/archive", + "/preview/evaluations/runs/close", json={"run_ids": [run_id_1, run_id_2]}, ) @@ -257,7 +249,7 @@ def test_unarchive_evaluation_runs(self, authed_api): # ACT ------------------------------------------------------------------ response = authed_api( "POST", - "/preview/evaluations/runs/unarchive", + "/preview/evaluations/runs/open", json={"run_ids": [run_id_1, run_id_2]}, ) # ---------------------------------------------------------------------- @@ -270,47 +262,6 @@ def test_unarchive_evaluation_runs(self, authed_api): assert response["runs"][1]["id"] == run_id_2 # ---------------------------------------------------------------------- - def test_close_evaluation_runs(self, authed_api): - # ARRANGE -------------------------------------------------------------- - runs = [ - {"name": "test_close_evaluation_runs_1"}, - {"name": "test_close_evaluation_runs_2"}, - ] - - response = 
authed_api( - "POST", - "/preview/evaluations/runs/", - json={"runs": runs}, - ) - - assert response.status_code == 200 - response = response.json() - assert response["count"] == 2 - runs = response["runs"] - assert runs[0]["name"] == "test_close_evaluation_runs_1" - assert runs[1]["name"] == "test_close_evaluation_runs_2" - run_id_1 = runs[0]["id"] - run_id_2 = runs[1]["id"] - # ---------------------------------------------------------------------- - - # ACT ------------------------------------------------------------------ - response = authed_api( - "POST", - "/preview/evaluations/runs/close", - json={"run_ids": [run_id_1, run_id_2]}, - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 2 - assert response["runs"][0]["id"] == run_id_1 - assert response["runs"][1]["id"] == run_id_2 - assert response["runs"][0]["flags"] == {"is_closed": True} - assert response["runs"][1]["flags"] == {"is_closed": True} - # ---------------------------------------------------------------------- - def test_fetch_evaluation_run(self, authed_api): # ARRANGE -------------------------------------------------------------- runs = [ @@ -452,10 +403,10 @@ def test_delete_evaluation_run(self, authed_api): assert response["count"] == 0 # ---------------------------------------------------------------------- - def test_archive_evaluation_run(self, authed_api): + def test_close_evaluation_run(self, authed_api): # ARRANGE -------------------------------------------------------------- runs = [ - {"name": "test_archive_evaluation_run"}, + {"name": "test_close_evaluation_run"}, ] response = authed_api( @@ -472,7 +423,7 @@ def test_archive_evaluation_run(self, authed_api): # ACT ------------------------------------------------------------------ response = authed_api( "POST", - f"/preview/evaluations/runs/{run_id}/archive", + f"/preview/evaluations/runs/{run_id}/close", ) # ---------------------------------------------------------------------- @@ -483,10 +434,10 @@ def test_archive_evaluation_run(self, authed_api): assert response["run"]["id"] == run_id # ---------------------------------------------------------------------- - def test_unarchive_evaluation_run(self, authed_api): + def test_open_evaluation_run(self, authed_api): # ARRANGE -------------------------------------------------------------- runs = [ - {"name": "test_unarchive_evaluation_run"}, + {"name": "test_open_evaluation_run"}, ] response = authed_api( @@ -501,50 +452,16 @@ def test_unarchive_evaluation_run(self, authed_api): response = authed_api( "POST", - f"/preview/evaluations/runs/{run_id}/archive", - ) - - assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert response["run"]["id"] == run_id - # ---------------------------------------------------------------------- - - # ACT ------------------------------------------------------------------ - response = authed_api( - "POST", - f"/preview/evaluations/runs/{run_id}/unarchive", - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert response["run"]["id"] == run_id - # ---------------------------------------------------------------------- - - def 
test_close_evaluation_run(self, authed_api): - # ARRANGE -------------------------------------------------------------- - runs = [ - {"name": "test_close_evaluation_run"}, - ] - - response = authed_api( - "POST", - "/preview/evaluations/runs/", - json={"runs": runs}, + f"/preview/evaluations/runs/{run_id}/close", ) assert response.status_code == 200 - - run_id = response.json()["runs"][0]["id"] # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ response = authed_api( "POST", - f"/preview/evaluations/runs/{run_id}/close", + f"/preview/evaluations/runs/{run_id}/open", ) # ---------------------------------------------------------------------- @@ -553,5 +470,4 @@ def test_close_evaluation_run(self, authed_api): response = response.json() assert response["count"] == 1 assert response["run"]["id"] == run_id - assert response["run"]["flags"] == {"is_closed": True} # ---------------------------------------------------------------------- diff --git a/api/oss/tests/pytest/evaluations/test_evaluation_runs_queries.py b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_runs_queries.py similarity index 66% rename from api/oss/tests/pytest/evaluations/test_evaluation_runs_queries.py rename to api/oss/tests/pytest/e2e/evaluations/test_evaluation_runs_queries.py index d556f0f3ad..0a657c7dd4 100644 --- a/api/oss/tests/pytest/evaluations/test_evaluation_runs_queries.py +++ b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_runs_queries.py @@ -1,5 +1,4 @@ -from json import dumps -from urllib.parse import quote +from uuid import uuid4 import pytest @@ -7,14 +6,18 @@ @pytest.fixture(scope="class") def mock_data(authed_api): # ARRANGE ------------------------------------------------------------------ + unique_marker = uuid4().hex[:8] + tags = { "tags1": "value1", "tags2": "value2", + "_marker": unique_marker, } meta = { "meta1": "value1", "meta2": "value2", + "_marker": unique_marker, } run = { @@ -38,11 +41,13 @@ def mock_data(authed_api): tags = { "tags1": "value2", "tags2": "value3", + "_marker": unique_marker, } meta = { "meta1": "value2", "meta2": "value3", + "_marker": unique_marker, } run = { @@ -66,11 +71,13 @@ def mock_data(authed_api): tags = { "tags1": "value3", "tags2": "value1", + "_marker": unique_marker, } meta = { "meta1": "value3", "meta2": "value1", + "_marker": unique_marker, } run = { @@ -92,7 +99,7 @@ def mock_data(authed_api): response = authed_api( "POST", - f"/preview/evaluations/runs/{run_3['id']}/archive", + f"/preview/evaluations/runs/{run_3['id']}/close", ) assert response.status_code == 200 @@ -100,31 +107,50 @@ def mock_data(authed_api): # -------------------------------------------------------------------------- _mock_data = { "runs": [run_1, run_2, run_3], + "_marker": unique_marker, } return _mock_data class TestEvaluationRunsQueries: - def test_query_evaluations_runs_non_archived(self, authed_api, mock_data): + def test_query_evaluations_runs_by_marker(self, authed_api, mock_data): + marker = mock_data["_marker"] + # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/evaluations/runs/", + "POST", + "/preview/evaluations/runs/query", + json={ + "run": { + "tags": {"_marker": marker}, + }, + }, ) # ---------------------------------------------------------------------- # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() - assert 
response["count"] == 2 + assert response["count"] == 3 + run_ids = [r["id"] for r in response["runs"]] + assert mock_data["runs"][0]["id"] in run_ids + assert mock_data["runs"][1]["id"] in run_ids + assert mock_data["runs"][2]["id"] in run_ids # ---------------------------------------------------------------------- - def test_query_evaluations_runs_include_archived(self, authed_api, mock_data): + def test_query_evaluations_runs_by_ids(self, authed_api, mock_data): + run_ids = [r["id"] for r in mock_data["runs"]] + # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/evaluations/runs/?include_archived=true", + "POST", + "/preview/evaluations/runs/query", + json={ + "run": { + "ids": run_ids, + }, + }, ) # ---------------------------------------------------------------------- @@ -135,40 +161,18 @@ def test_query_evaluations_runs_include_archived(self, authed_api, mock_data): # ---------------------------------------------------------------------- def test_query_evaluations_runs_by_flags(self, authed_api, mock_data): - # ACT ------------------------------------------------------------------ - flags = { - "is_closed": True, - } - - flags = quote(dumps(flags)) - - response = authed_api( - "GET", - f"/preview/evaluations/runs/?flags={flags}&include_archived=true", - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert response["runs"][0]["tags"] == { - "tags1": "value3", - "tags2": "value1", - } - # ---------------------------------------------------------------------- + marker = mock_data["_marker"] - def test_query_evaluations_runs_by_tags(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ - tags = { - "tags1": "value1", - "tags2": "value2", - } - tags = quote(dumps(tags)) - response = authed_api( - "GET", - f"/preview/evaluations/runs/?tags={tags}", + "POST", + "/preview/evaluations/runs/query", + json={ + "run": { + "flags": {"is_closed": True}, + "tags": {"_marker": marker}, + }, + }, ) # ---------------------------------------------------------------------- @@ -176,45 +180,25 @@ def test_query_evaluations_runs_by_tags(self, authed_api, mock_data): assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert response["runs"][0]["tags"] == { - "tags1": "value1", - "tags2": "value2", - } - # ---------------------------------------------------------------------- - - # ACT ------------------------------------------------------------------ - tags = { - "tags1": "value2", - "tags2": "value3", - } - tags = quote(dumps(tags)) - response = authed_api( - "GET", - f"/preview/evaluations/runs/?tags={tags}", - ) + assert response["runs"][0]["id"] == mock_data["runs"][2]["id"] # ---------------------------------------------------------------------- - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert response["runs"][0]["tags"] == { - "tags1": "value2", - "tags2": "value3", - } - # ---------------------------------------------------------------------- + def test_query_evaluations_runs_by_tags(self, authed_api, mock_data): + marker = mock_data["_marker"] - def test_query_evaluations_runs_by_meta(self, authed_api, 
mock_data): # ACT ------------------------------------------------------------------ - meta = { - "meta1": "value1", - "meta2": "value2", - } - meta = quote(dumps(meta)) - response = authed_api( - "GET", - f"/preview/evaluations/runs/?meta={meta}", + "POST", + "/preview/evaluations/runs/query", + json={ + "run": { + "tags": { + "tags1": "value1", + "tags2": "value2", + "_marker": marker, + }, + }, + }, ) # ---------------------------------------------------------------------- @@ -222,21 +206,22 @@ def test_query_evaluations_runs_by_meta(self, authed_api, mock_data): assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert response["runs"][0]["meta"] == { - "meta1": "value1", - "meta2": "value2", - } + assert response["runs"][0]["id"] == mock_data["runs"][0]["id"] # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - meta = { - "meta1": "value2", - "meta2": "value3", - } - meta = quote(dumps(meta)) response = authed_api( - "GET", - f"/preview/evaluations/runs/?meta={meta}", + "POST", + "/preview/evaluations/runs/query", + json={ + "run": { + "tags": { + "tags1": "value2", + "tags2": "value3", + "_marker": marker, + }, + }, + }, ) # ---------------------------------------------------------------------- @@ -244,17 +229,22 @@ def test_query_evaluations_runs_by_meta(self, authed_api, mock_data): assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert response["runs"][0]["meta"] == { - "meta1": "value2", - "meta2": "value3", - } + assert response["runs"][0]["id"] == mock_data["runs"][1]["id"] # ---------------------------------------------------------------------- def test_query_evaluations_runs_by_status(self, authed_api, mock_data): + marker = mock_data["_marker"] + # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/evaluations/runs/?status=success", + "POST", + "/preview/evaluations/runs/query", + json={ + "run": { + "status": "success", + "tags": {"_marker": marker}, + }, + }, ) # ---------------------------------------------------------------------- @@ -267,8 +257,14 @@ def test_query_evaluations_runs_by_status(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/evaluations/runs/?status=pending", + "POST", + "/preview/evaluations/runs/query", + json={ + "run": { + "status": "pending", + "tags": {"_marker": marker}, + }, + }, ) # ---------------------------------------------------------------------- @@ -281,8 +277,14 @@ def test_query_evaluations_runs_by_status(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/evaluations/runs/?status=failure&include_archived=true", + "POST", + "/preview/evaluations/runs/query", + json={ + "run": { + "status": "failure", + "tags": {"_marker": marker}, + }, + }, ) # ---------------------------------------------------------------------- diff --git a/api/oss/tests/pytest/evaluations/test_evaluation_scenarios_basics.py b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_scenarios_basics.py similarity index 100% rename from api/oss/tests/pytest/evaluations/test_evaluation_scenarios_basics.py rename to api/oss/tests/pytest/e2e/evaluations/test_evaluation_scenarios_basics.py diff --git 
a/api/oss/tests/pytest/evaluations/test_evaluation_scenarios_queries.py b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_scenarios_queries.py similarity index 79% rename from api/oss/tests/pytest/evaluations/test_evaluation_scenarios_queries.py rename to api/oss/tests/pytest/e2e/evaluations/test_evaluation_scenarios_queries.py index 9c50ebc138..a0f3cbcd3f 100644 --- a/api/oss/tests/pytest/evaluations/test_evaluation_scenarios_queries.py +++ b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_scenarios_queries.py @@ -1,6 +1,3 @@ -from json import dumps -from urllib.parse import quote - import pytest @@ -149,10 +146,17 @@ def mock_data(authed_api): class TestEvaluationScenariosQueries: def test_query_evaluation_scenarios_all(self, authed_api, mock_data): + run_ids = [r["id"] for r in mock_data["runs"]] + # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/evaluations/scenarios/", + "POST", + "/preview/evaluations/scenarios/query", + json={ + "scenario": { + "run_ids": run_ids, + }, + }, ) # ---------------------------------------------------------------------- @@ -164,33 +168,18 @@ def test_query_evaluation_scenarios_all(self, authed_api, mock_data): # ---------------------------------------------------------------------- def test_query_evaluation_scenarios_by_tags(self, authed_api, mock_data): - # ARRANGE --------------------------------------------------------------- - tags = {"tags1": "value1"} - # ---------------------------------------------------------------------- - - # ACT ------------------------------------------------------------------ - response = authed_api( - "GET", - f"/preview/evaluations/scenarios/?tags={quote(dumps(tags))}", - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert len(response["scenarios"]) == 1 - # ---------------------------------------------------------------------- - - def test_query_evaluation_scenarios_by_meta(self, authed_api, mock_data): - # ARRANGE --------------------------------------------------------------- - meta = {"meta1": "value1"} - # ---------------------------------------------------------------------- + run_ids = [r["id"] for r in mock_data["runs"]] # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - f"/preview/evaluations/scenarios/?meta={quote(dumps(meta))}", + "POST", + "/preview/evaluations/scenarios/query", + json={ + "scenario": { + "tags": {"tags1": "value1"}, + "run_ids": run_ids, + }, + }, ) # ---------------------------------------------------------------------- @@ -202,14 +191,16 @@ def test_query_evaluation_scenarios_by_meta(self, authed_api, mock_data): # ---------------------------------------------------------------------- def test_query_evaluation_scenarios_by_run_ids(self, authed_api, mock_data): - # ARRANGE --------------------------------------------------------------- - run_id = mock_data["runs"][0]["id"] - # ---------------------------------------------------------------------- - # ACT ------------------------------------------------------------------ + run_id = mock_data["runs"][0]["id"] response = authed_api( - "GET", - f"/preview/evaluations/scenarios/?run_ids={run_id}", + "POST", + "/preview/evaluations/scenarios/query", + json={ + "scenario": { + "run_ids": [run_id], + }, + }, ) # 
---------------------------------------------------------------------- @@ -223,8 +214,13 @@ def test_query_evaluation_scenarios_by_run_ids(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ run_id = mock_data["runs"][1]["id"] response = authed_api( - "GET", - f"/preview/evaluations/scenarios/?run_ids={run_id}", + "POST", + "/preview/evaluations/scenarios/query", + json={ + "scenario": { + "run_ids": [run_id], + }, + }, ) # ---------------------------------------------------------------------- @@ -239,8 +235,13 @@ def test_query_evaluation_scenarios_by_run_ids(self, authed_api, mock_data): run_1_id = mock_data["runs"][0]["id"] run_2_id = mock_data["runs"][1]["id"] response = authed_api( - "GET", - f"/preview/evaluations/scenarios/?run_ids={run_1_id}&run_ids={run_2_id}", + "POST", + "/preview/evaluations/scenarios/query", + json={ + "scenario": { + "run_ids": [run_1_id, run_2_id], + }, + }, ) # ---------------------------------------------------------------------- @@ -251,10 +252,18 @@ def test_query_evaluation_scenarios_by_run_ids(self, authed_api, mock_data): # ---------------------------------------------------------------------- def test_query_evaluation_scenarios_by_status(self, authed_api, mock_data): + run_ids = [r["id"] for r in mock_data["runs"]] + # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/evaluations/scenarios/?status=success", + "POST", + "/preview/evaluations/scenarios/query", + json={ + "scenario": { + "status": "success", + "run_ids": run_ids, + }, + }, ) # ---------------------------------------------------------------------- @@ -266,8 +275,14 @@ def test_query_evaluation_scenarios_by_status(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/evaluations/scenarios/?status=pending", + "POST", + "/preview/evaluations/scenarios/query", + json={ + "scenario": { + "status": "pending", + "run_ids": run_ids, + }, + }, ) # ---------------------------------------------------------------------- @@ -279,8 +294,14 @@ def test_query_evaluation_scenarios_by_status(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/evaluations/scenarios/?status=running", + "POST", + "/preview/evaluations/scenarios/query", + json={ + "scenario": { + "status": "running", + "run_ids": run_ids, + }, + }, ) # ---------------------------------------------------------------------- diff --git a/api/oss/tests/pytest/evaluations/test_evaluation_steps_basics.py b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_steps_basics.py similarity index 71% rename from api/oss/tests/pytest/evaluations/test_evaluation_steps_basics.py rename to api/oss/tests/pytest/e2e/evaluations/test_evaluation_steps_basics.py index c571409dc8..6b6b0a8ff9 100644 --- a/api/oss/tests/pytest/evaluations/test_evaluation_steps_basics.py +++ b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_steps_basics.py @@ -1,5 +1,3 @@ -from uuid import uuid4 - import pytest @@ -46,20 +44,18 @@ def mock_data(authed_api): class TestEvaluationResultsBasics: - def test_create_evaluation_steps(self, authed_api, mock_data): + def test_create_evaluation_results(self, authed_api, mock_data): # ARRANGE -------------------------------------------------------------- run_id = mock_data["runs"][0]["id"] scenario_id = mock_data["scenarios"][0]["id"] - key = 
"input" - repeat_id = str(uuid4()) - retry_id = str(uuid4()) + step_key = "input" + repeat_idx = 0 - steps = [ + results = [ { - "key": "input", - "repeat_id": repeat_id, - "retry_id": retry_id, + "step_key": step_key, + "repeat_idx": repeat_idx, "scenario_id": scenario_id, "run_id": run_id, }, @@ -70,7 +66,7 @@ def test_create_evaluation_steps(self, authed_api, mock_data): response = authed_api( "POST", "/preview/evaluations/results/", - json={"steps": steps}, + json={"results": results}, ) # ---------------------------------------------------------------------- @@ -78,43 +74,34 @@ def test_create_evaluation_steps(self, authed_api, mock_data): assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert response["steps"][0]["key"] == key - assert response["steps"][0]["repeat_id"] == repeat_id - assert response["steps"][0]["retry_id"] == retry_id - assert response["steps"][0]["scenario_id"] == scenario_id - assert response["steps"][0]["run_id"] == run_id + assert response["results"][0]["step_key"] == step_key + assert response["results"][0]["repeat_idx"] == repeat_idx + assert response["results"][0]["scenario_id"] == scenario_id + assert response["results"][0]["run_id"] == run_id # ---------------------------------------------------------------------- - def test_fetch_evaluation_steps(self, authed_api, mock_data): + def test_fetch_evaluation_results(self, authed_api, mock_data): # ARRANGE -------------------------------------------------------------- run_id = mock_data["runs"][0]["id"] scenario_id = mock_data["scenarios"][1]["id"] - key_1 = "input" - key_2 = "invocation" - key_3 = "annotation" - repeat_id = str(uuid4()) - retry_id = str(uuid4()) + step_key_1 = "input" + step_key_2 = "invocation" + step_key_3 = "annotation" - steps = [ + results = [ { - "key": key_1, - "repeat_id": repeat_id, - "retry_id": retry_id, + "step_key": step_key_1, "scenario_id": scenario_id, "run_id": run_id, }, { - "key": key_2, - "repeat_id": repeat_id, - "retry_id": retry_id, + "step_key": step_key_2, "scenario_id": scenario_id, "run_id": run_id, }, { - "key": key_3, - "repeat_id": repeat_id, - "retry_id": retry_id, + "step_key": step_key_3, "scenario_id": scenario_id, "run_id": run_id, }, @@ -123,7 +110,7 @@ def test_fetch_evaluation_steps(self, authed_api, mock_data): response = authed_api( "POST", "/preview/evaluations/results/", - json={"steps": steps}, + json={"results": results}, ) assert response.status_code == 200 @@ -133,9 +120,13 @@ def test_fetch_evaluation_steps(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/evaluations/results/", - params={"scenario_id": scenario_id}, + "POST", + "/preview/evaluations/results/query", + json={ + "result": { + "scenario_id": scenario_id, + }, + }, ) # ---------------------------------------------------------------------- @@ -143,41 +134,37 @@ def test_fetch_evaluation_steps(self, authed_api, mock_data): assert response.status_code == 200 response = response.json() assert response["count"] == 3 - assert response["steps"][0]["key"] == key_1 - assert response["steps"][1]["key"] == key_2 - assert response["steps"][2]["key"] == key_3 + step_keys = [r["step_key"] for r in response["results"]] + assert step_key_1 in step_keys + assert step_key_2 in step_keys + assert step_key_3 in step_keys # ---------------------------------------------------------------------- - def test_edit_evaluation_steps(self, authed_api, mock_data): + def 
test_edit_evaluation_results(self, authed_api, mock_data): # ARRANGE -------------------------------------------------------------- run_id = mock_data["runs"][0]["id"] scenario_id = mock_data["scenarios"][0]["id"] - key_1 = "input" - key_2 = "invocation" - key_3 = "annotation" - repeat_id = str(uuid4()) - retry_id = str(uuid4()) + step_key_1 = "input" + step_key_2 = "invocation" + step_key_3 = "annotation" - steps = [ + results = [ { - "key": key_1, - "repeat_id": repeat_id, - "retry_id": retry_id, + "step_key": step_key_1, + "repeat_idx": 1, "scenario_id": scenario_id, "run_id": run_id, }, { - "key": key_2, - "repeat_id": repeat_id, - "retry_id": retry_id, + "step_key": step_key_2, + "repeat_idx": 1, "scenario_id": scenario_id, "run_id": run_id, }, { - "key": key_3, - "repeat_id": repeat_id, - "retry_id": retry_id, + "step_key": step_key_3, + "repeat_idx": 1, "scenario_id": scenario_id, "run_id": run_id, }, @@ -186,62 +173,55 @@ def test_edit_evaluation_steps(self, authed_api, mock_data): response = authed_api( "POST", "/preview/evaluations/results/", - json={"steps": steps}, + json={"results": results}, ) assert response.status_code == 200 response = response.json() assert response["count"] == 3 - assert response["steps"][0]["key"] == key_1 - assert response["steps"][1]["key"] == key_2 - assert response["steps"][2]["key"] == key_3 - steps = response["steps"] - result_ids = [step["id"] for step in steps] + results = response["results"] + result_ids = [r["id"] for r in results] # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - steps[0]["status"] = "success" - steps[1]["status"] = "failure" - steps[2]["status"] = "cancelled" + results[0]["status"] = "success" + results[1]["status"] = "failure" + results[2]["status"] = "cancelled" response = authed_api( "PATCH", "/preview/evaluations/results/", - json={"steps": steps}, + json={"results": results}, ) + # ---------------------------------------------------------------------- + # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() assert response["count"] == 3 - assert response["steps"][0]["id"] == result_ids[0] - assert response["steps"][0]["status"] == "success" - assert response["steps"][1]["id"] == result_ids[1] - assert response["steps"][1]["status"] == "failure" - assert response["steps"][2]["id"] == result_ids[2] - assert response["steps"][2]["status"] == "cancelled" + patched = {r["id"]: r for r in response["results"]} + assert patched[result_ids[0]]["status"] == "success" + assert patched[result_ids[1]]["status"] == "failure" + assert patched[result_ids[2]]["status"] == "cancelled" # ---------------------------------------------------------------------- - # ASSERT --------------------------------------------------------------- - - # ---------------------------------------------------------------------- - - def test_delete_evaluation_steps(self, authed_api, mock_data): + def test_delete_evaluation_results(self, authed_api, mock_data): # ARRANGE -------------------------------------------------------------- run_id = mock_data["runs"][0]["id"] + authed_api("POST", f"/preview/evaluations/runs/{run_id}/open") scenario_id = mock_data["scenarios"][0]["id"] - key_1 = "input" - key_2 = "invocation" - - steps = [ + results = [ { - "key": key_1, + "step_key": "input", + "repeat_idx": 2, "scenario_id": scenario_id, "run_id": run_id, }, { - "key": key_2, + "step_key": 
"invocation", + "repeat_idx": 2, "scenario_id": scenario_id, "run_id": run_id, }, @@ -250,14 +230,14 @@ def test_delete_evaluation_steps(self, authed_api, mock_data): response = authed_api( "POST", "/preview/evaluations/results/", - json={"steps": steps}, + json={"results": results}, ) assert response.status_code == 200 response = response.json() assert response["count"] == 2 - result_ids = [step["id"] for step in response["steps"]] + result_ids = [r["id"] for r in response["results"]] # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ @@ -289,16 +269,15 @@ def test_delete_evaluation_steps(self, authed_api, mock_data): assert response["count"] == 0 # ---------------------------------------------------------------------- - def test_fetch_evaluation_step(self, authed_api, mock_data): + def test_fetch_evaluation_result(self, authed_api, mock_data): # ARRANGE -------------------------------------------------------------- run_id = mock_data["runs"][0]["id"] + authed_api("POST", f"/preview/evaluations/runs/{run_id}/open") scenario_id = mock_data["scenarios"][2]["id"] - key_1 = "input" - - steps = [ + results = [ { - "key": key_1, + "step_key": "input", "scenario_id": scenario_id, "run_id": run_id, }, @@ -307,14 +286,14 @@ def test_fetch_evaluation_step(self, authed_api, mock_data): response = authed_api( "POST", "/preview/evaluations/results/", - json={"steps": steps}, + json={"results": results}, ) assert response.status_code == 200 response = response.json() assert response["count"] == 1 - result_id = response["steps"][0]["id"] + result_id = response["results"][0]["id"] # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ @@ -328,19 +307,19 @@ def test_fetch_evaluation_step(self, authed_api, mock_data): assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert response["step"]["id"] == result_id + assert response["result"]["id"] == result_id # ---------------------------------------------------------------------- - def test_edit_evaluation_step(self, authed_api, mock_data): + def test_edit_evaluation_result(self, authed_api, mock_data): # ARRANGE -------------------------------------------------------------- run_id = mock_data["runs"][0]["id"] + authed_api("POST", f"/preview/evaluations/runs/{run_id}/open") scenario_id = mock_data["scenarios"][0]["id"] - key_1 = "input" - - steps = [ + results = [ { - "key": key_1, + "step_key": "input", + "repeat_idx": 3, "scenario_id": scenario_id, "run_id": run_id, }, @@ -349,48 +328,47 @@ def test_edit_evaluation_step(self, authed_api, mock_data): response = authed_api( "POST", "/preview/evaluations/results/", - json={"steps": steps}, + json={"results": results}, ) assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert response["steps"][0]["key"] == key_1 - assert response["steps"][0]["status"] == "pending" + assert response["results"][0]["step_key"] == "input" + assert response["results"][0]["status"] == "pending" - step = response["steps"][0] - result_id = step["id"] + result = response["results"][0] + result_id = result["id"] # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - step["status"] = "success" + result["status"] = "success" response = authed_api( "PATCH", 
f"/preview/evaluations/results/{result_id}", - json={"step": step}, + json={"result": result}, ) # ---------------------------------------------------------------------- # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() - print(response) assert response["count"] == 1 - assert response["step"]["id"] == result_id - assert response["step"]["status"] == "success" + assert response["result"]["id"] == result_id + assert response["result"]["status"] == "success" # ---------------------------------------------------------------------- - def test_delete_evaluation_step(self, authed_api, mock_data): + def test_delete_evaluation_result(self, authed_api, mock_data): # ARRANGE -------------------------------------------------------------- run_id = mock_data["runs"][0]["id"] + authed_api("POST", f"/preview/evaluations/runs/{run_id}/open") scenario_id = mock_data["scenarios"][0]["id"] - key_1 = "input" - - steps = [ + results = [ { - "key": key_1, + "step_key": "input", + "repeat_idx": 4, "scenario_id": scenario_id, "run_id": run_id, }, @@ -399,14 +377,14 @@ def test_delete_evaluation_step(self, authed_api, mock_data): response = authed_api( "POST", "/preview/evaluations/results/", - json={"steps": steps}, + json={"results": results}, ) assert response.status_code == 200 response = response.json() assert response["count"] == 1 - result_id = response["steps"][0]["id"] + result_id = response["results"][0]["id"] # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ diff --git a/api/oss/tests/pytest/evaluations/test_evaluation_steps_queries.py b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_steps_queries.py similarity index 71% rename from api/oss/tests/pytest/evaluations/test_evaluation_steps_queries.py rename to api/oss/tests/pytest/e2e/evaluations/test_evaluation_steps_queries.py index 6cc2ce4405..1e8a016217 100644 --- a/api/oss/tests/pytest/evaluations/test_evaluation_steps_queries.py +++ b/api/oss/tests/pytest/e2e/evaluations/test_evaluation_steps_queries.py @@ -1,5 +1,3 @@ -from uuid import uuid4 - import pytest @@ -7,7 +5,7 @@ def mock_data(authed_api): # ARRANGE ------------------------------------------------------------------ runs = [ - {"name": "test_evaluation_steps_basics"}, + {"name": "test_evaluation_steps_queries"}, ] response = authed_api( @@ -35,10 +33,8 @@ def mock_data(authed_api): scenarios = response.json()["scenarios"] - repeat_id_1 = str(uuid4()) - retry_id_1 = str(uuid4()) - repeat_id_2 = str(uuid4()) - retry_id_2 = str(uuid4()) + repeat_idx_1 = 0 + repeat_idx_2 = 1 tags = { "tag1": "value1", @@ -50,11 +46,10 @@ def mock_data(authed_api): "meta2": "value2", } - steps = [ + results = [ { - "key": "input", - "repeat_id": repeat_id_1, - "retry_id": retry_id_1, + "step_key": "input", + "repeat_idx": repeat_idx_1, "scenario_id": scenarios[0]["id"], "run_id": run_1["id"], "status": "success", @@ -62,33 +57,29 @@ def mock_data(authed_api): "meta": meta, }, { - "key": "invocation", - "repeat_id": repeat_id_1, - "retry_id": retry_id_1, + "step_key": "invocation", + "repeat_idx": repeat_idx_1, "scenario_id": scenarios[0]["id"], "run_id": run_1["id"], "status": "failure", }, { - "key": "annotation", - "repeat_id": repeat_id_1, - "retry_id": retry_id_1, + "step_key": "annotation", + "repeat_idx": repeat_idx_1, "scenario_id": scenarios[0]["id"], "run_id": run_1["id"], "status": "cancelled", }, { - "key": "input", - 
"repeat_id": repeat_id_2, - "retry_id": retry_id_2, + "step_key": "input", + "repeat_idx": repeat_idx_2, "scenario_id": scenarios[0]["id"], "run_id": run_1["id"], "status": "success", }, { - "key": "invocation", - "repeat_id": repeat_id_2, - "retry_id": retry_id_2, + "step_key": "invocation", + "repeat_idx": repeat_idx_2, "scenario_id": scenarios[0]["id"], "run_id": run_1["id"], "status": "failure", @@ -96,25 +87,22 @@ def mock_data(authed_api): "meta": meta, }, { - "key": "annotation", - "repeat_id": repeat_id_2, - "retry_id": retry_id_2, + "step_key": "annotation", + "repeat_idx": repeat_idx_2, "scenario_id": scenarios[0]["id"], "run_id": run_1["id"], "status": "cancelled", }, { - "key": "input", - "repeat_id": repeat_id_1, - "retry_id": retry_id_1, + "step_key": "input", + "repeat_idx": repeat_idx_1, "scenario_id": scenarios[1]["id"], "run_id": run_1["id"], "status": "success", }, { - "key": "invocation", - "repeat_id": repeat_id_1, - "retry_id": retry_id_1, + "step_key": "invocation", + "repeat_idx": repeat_idx_1, "scenario_id": scenarios[1]["id"], "run_id": run_1["id"], "status": "failure", @@ -122,9 +110,8 @@ def mock_data(authed_api): "meta": meta, }, { - "key": "annotation", - "repeat_id": repeat_id_1, - "retry_id": retry_id_1, + "step_key": "annotation", + "repeat_idx": repeat_idx_1, "scenario_id": scenarios[1]["id"], "run_id": run_1["id"], "status": "cancelled", @@ -134,20 +121,20 @@ def mock_data(authed_api): response = authed_api( "POST", "/preview/evaluations/results/", - json={"steps": steps}, + json={"results": results}, ) assert response.status_code == 200 response = response.json() assert response["count"] == 9 - steps = response["steps"] + results = response["results"] # -------------------------------------------------------------------------- _mock_data = { "runs": [run_1], "scenarios": scenarios, - "steps": steps, + "results": results, } return _mock_data @@ -155,12 +142,16 @@ def mock_data(authed_api): class TestEvaluationResultsQueries: def test_query_results_all(self, authed_api, mock_data): + run_id = mock_data["runs"][0]["id"] + # ACT ------------------------------------------------------------------ response = authed_api( "POST", "/preview/evaluations/results/query", json={ - "step": {}, + "result": { + "run_id": run_id, + }, }, ) # ---------------------------------------------------------------------- @@ -172,38 +163,19 @@ def test_query_results_all(self, authed_api, mock_data): # ---------------------------------------------------------------------- def test_query_results_by_tags(self, authed_api, mock_data): + run_id = mock_data["runs"][0]["id"] + # ACT ------------------------------------------------------------------ response = authed_api( "POST", "/preview/evaluations/results/query", json={ - "step": { + "result": { + "run_id": run_id, "tags": { "tag1": "value1", "tag2": "value2", - } - }, - }, - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 3 - # ---------------------------------------------------------------------- - - def test_query_results_by_meta(self, authed_api, mock_data): - # ACT ------------------------------------------------------------------ - response = authed_api( - "POST", - "/preview/evaluations/results/query", - json={ - "step": { - "meta": { - "meta1": "value1", - "meta2": "value2", - } + }, }, }, ) @@ -221,7 +193,7 @@ def 
test_query_results_by_run_id(self, authed_api, mock_data): "POST", "/preview/evaluations/results/query", json={ - "step": { + "result": { "run_id": mock_data["runs"][0]["id"], }, }, @@ -240,7 +212,7 @@ def test_query_results_by_run_ids(self, authed_api, mock_data): "POST", "/preview/evaluations/results/query", json={ - "step": { + "result": { "run_ids": [mock_data["runs"][0]["id"]], }, }, @@ -259,7 +231,7 @@ def test_query_results_by_scenario_id(self, authed_api, mock_data): "POST", "/preview/evaluations/results/query", json={ - "step": { + "result": { "scenario_id": mock_data["scenarios"][0]["id"], }, }, @@ -278,7 +250,7 @@ def test_query_results_by_scenario_ids(self, authed_api, mock_data): "POST", "/preview/evaluations/results/query", json={ - "step": { + "result": { "scenario_ids": [s["id"] for s in mock_data["scenarios"]], }, }, @@ -297,8 +269,8 @@ def test_query_results_by_ids(self, authed_api, mock_data): "POST", "/preview/evaluations/results/query", json={ - "step": { - "ids": [s["id"] for s in mock_data["steps"][:-1]], + "result": { + "ids": [s["id"] for s in mock_data["results"][:-1]], }, }, ) @@ -310,14 +282,17 @@ def test_query_results_by_ids(self, authed_api, mock_data): assert response["count"] == 9 - 1 # ---------------------------------------------------------------------- - def test_query_results_by_key(self, authed_api, mock_data): + def test_query_results_by_step_key(self, authed_api, mock_data): + run_id = mock_data["runs"][0]["id"] + # ACT ------------------------------------------------------------------ response = authed_api( "POST", "/preview/evaluations/results/query", json={ - "step": { - "key": "input", + "result": { + "run_id": run_id, + "step_key": "input", }, }, ) @@ -329,33 +304,17 @@ def test_query_results_by_key(self, authed_api, mock_data): assert response["count"] == 3 # ---------------------------------------------------------------------- - def test_query_results_by_keys(self, authed_api, mock_data): - # ACT ------------------------------------------------------------------ - response = authed_api( - "POST", - "/preview/evaluations/results/query", - json={ - "step": { - "keys": ["input", "invocation"], - }, - }, - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 6 - # ---------------------------------------------------------------------- + def test_query_results_by_step_keys(self, authed_api, mock_data): + run_id = mock_data["runs"][0]["id"] - def test_query_results_by_repeat_id(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ response = authed_api( "POST", "/preview/evaluations/results/query", json={ - "step": { - "repeat_id": mock_data["steps"][0]["repeat_id"], + "result": { + "run_id": run_id, + "step_keys": ["input", "invocation"], }, }, ) @@ -367,36 +326,17 @@ def test_query_results_by_repeat_id(self, authed_api, mock_data): assert response["count"] == 6 # ---------------------------------------------------------------------- - def test_query_results_by_repeat_ids(self, authed_api, mock_data): - # ACT ------------------------------------------------------------------ - response = authed_api( - "POST", - "/preview/evaluations/results/query", - json={ - "step": { - "repeat_ids": [ - mock_data["steps"][0]["repeat_id"], - mock_data["steps"][3]["repeat_id"], - ] - }, - }, - ) - # 
---------------------------------------------------------------------- + def test_query_results_by_repeat_idx(self, authed_api, mock_data): + run_id = mock_data["runs"][0]["id"] - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 9 - # ---------------------------------------------------------------------- - - def test_query_results_by_retry_id(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ response = authed_api( "POST", "/preview/evaluations/results/query", json={ - "step": { - "retry_id": mock_data["steps"][0]["retry_id"], + "result": { + "run_id": run_id, + "repeat_idx": mock_data["results"][0]["repeat_idx"], }, }, ) @@ -408,17 +348,20 @@ def test_query_results_by_retry_id(self, authed_api, mock_data): assert response["count"] == 6 # ---------------------------------------------------------------------- - def test_query_results_by_retry_ids(self, authed_api, mock_data): + def test_query_results_by_repeat_idxs(self, authed_api, mock_data): + run_id = mock_data["runs"][0]["id"] + # ACT ------------------------------------------------------------------ response = authed_api( "POST", "/preview/evaluations/results/query", json={ - "step": { - "retry_ids": [ - mock_data["steps"][0]["retry_id"], - mock_data["steps"][3]["retry_id"], - ] + "result": { + "run_id": run_id, + "repeat_idxs": [ + mock_data["results"][0]["repeat_idx"], + mock_data["results"][3]["repeat_idx"], + ], }, }, ) @@ -431,12 +374,15 @@ def test_query_results_by_retry_ids(self, authed_api, mock_data): # ---------------------------------------------------------------------- def test_query_results_by_status(self, authed_api, mock_data): + run_id = mock_data["runs"][0]["id"] + # ACT ------------------------------------------------------------------ response = authed_api( "POST", "/preview/evaluations/results/query", json={ - "step": { + "result": { + "run_id": run_id, "status": "success", }, }, @@ -450,12 +396,15 @@ def test_query_results_by_status(self, authed_api, mock_data): # ---------------------------------------------------------------------- def test_query_results_by_statuses(self, authed_api, mock_data): + run_id = mock_data["runs"][0]["id"] + # ACT ------------------------------------------------------------------ response = authed_api( "POST", "/preview/evaluations/results/query", json={ - "step": { + "result": { + "run_id": run_id, "statuses": ["success", "failure"], }, }, diff --git a/api/oss/tests/pytest/evaluators/__init__.py b/api/oss/tests/pytest/e2e/evaluators/__init__.py similarity index 100% rename from api/oss/tests/pytest/evaluators/__init__.py rename to api/oss/tests/pytest/e2e/evaluators/__init__.py diff --git a/api/oss/tests/pytest/evaluators/test_evaluators_basics.py b/api/oss/tests/pytest/e2e/evaluators/test_evaluators_basics.py similarity index 100% rename from api/oss/tests/pytest/evaluators/test_evaluators_basics.py rename to api/oss/tests/pytest/e2e/evaluators/test_evaluators_basics.py diff --git a/api/oss/tests/pytest/evaluators/test_evaluators_queries.py b/api/oss/tests/pytest/e2e/evaluators/test_evaluators_queries.py similarity index 66% rename from api/oss/tests/pytest/evaluators/test_evaluators_queries.py rename to api/oss/tests/pytest/e2e/evaluators/test_evaluators_queries.py index f656123133..17fca07b9e 100644 --- a/api/oss/tests/pytest/evaluators/test_evaluators_queries.py +++ 
b/api/oss/tests/pytest/e2e/evaluators/test_evaluators_queries.py @@ -134,8 +134,8 @@ def test_query_non_archived_evaluators( ): # ACT ------------------------------------------------------------------ response = authed_api( - "POST", # TODO: FIX ME - "/preview/simple/evaluators/query", # TODO: FIX ME + "POST", + "/preview/simple/evaluators/query", json={}, ) # ---------------------------------------------------------------------- @@ -143,8 +143,9 @@ def test_query_non_archived_evaluators( # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() - assert response["count"] == 1 - assert response["evaluators"][0]["id"] == mock_data["evaluators"][0]["id"] + evaluator_ids = [e["id"] for e in response["evaluators"]] + assert mock_data["evaluators"][0]["id"] in evaluator_ids + assert mock_data["evaluators"][1]["id"] not in evaluator_ids # archived # ---------------------------------------------------------------------- def test_query_all_evaluators( @@ -154,8 +155,8 @@ def test_query_all_evaluators( ): # ACT ------------------------------------------------------------------ response = authed_api( - "POST", # TODO: FIX ME - "/preview/simple/evaluators/query", # TODO: FIX ME + "POST", + "/preview/simple/evaluators/query", json={ "include_archived": True, }, @@ -165,10 +166,9 @@ def test_query_all_evaluators( # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() - assert response["count"] == 2 - assert len(response["evaluators"]) == 2 - assert response["evaluators"][0]["id"] == mock_data["evaluators"][0]["id"] - assert response["evaluators"][1]["id"] == mock_data["evaluators"][1]["id"] + evaluator_ids = [e["id"] for e in response["evaluators"]] + assert mock_data["evaluators"][0]["id"] in evaluator_ids + assert mock_data["evaluators"][1]["id"] in evaluator_ids # ---------------------------------------------------------------------- def test_query_paginated_evaluators( @@ -177,53 +177,57 @@ def test_query_paginated_evaluators( mock_data, ): # ACT ------------------------------------------------------------------ + # First, get total count with include_archived response = authed_api( - "POST", # TODO: FIX ME - "/preview/simple/evaluators/query", # TODO: FIX ME + "POST", + "/preview/simple/evaluators/query", json={ "include_archived": True, - "windowing": {"limit": 1}, }, ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert response["evaluators"][0]["id"] == mock_data["evaluators"][0]["id"] + total_evaluators = response.json()["evaluators"] + total_count = len(total_evaluators) # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - response = authed_api( - "POST", # TODO: FIX ME - "/preview/simple/evaluators/query", # TODO: FIX ME - json={ - "include_archived": True, - "windowing": {"limit": 1, "next": response["evaluators"][0]["id"]}, - }, - ) + # Page through all evaluators one by one + seen_ids = [] + next_cursor = None + for _ in range(total_count): + windowing = {"limit": 1} + if next_cursor: + windowing["next"] = next_cursor + response = authed_api( + "POST", + "/preview/simple/evaluators/query", + json={ + "include_archived": True, + "windowing": 
windowing, + }, + ) + assert response.status_code == 200 + response = response.json() + assert response["count"] == 1 + seen_ids.append(response["evaluators"][0]["id"]) + next_cursor = response["evaluators"][0]["id"] # ---------------------------------------------------------------------- # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert response["evaluators"][0]["id"] == mock_data["evaluators"][1]["id"] - # ---------------------------------------------------------------------- + # Verify all evaluators were seen + assert len(seen_ids) == total_count + for e in total_evaluators: + assert e["id"] in seen_ids - # ACT ------------------------------------------------------------------ + # Verify next page is empty response = authed_api( - "POST", # TODO: FIX ME - "/preview/simple/evaluators/query", # TODO: FIX ME + "POST", + "/preview/simple/evaluators/query", json={ "include_archived": True, - "windowing": {"limit": 1, "next": response["evaluators"][0]["id"]}, + "windowing": {"limit": 1, "next": next_cursor}, }, ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() assert response["count"] == 0 @@ -235,12 +239,13 @@ def test_query_evaluators_by_flags( mock_data, ): # ACT ------------------------------------------------------------------ - # flags = quote(dumps(mock_data["evaluators"][0]["flags"])) response = authed_api( - "POST", # TODO: FIX ME - "/preview/simple/evaluators/query", # TODO: FIX ME + "POST", + "/preview/simple/evaluators/query", json={ - "flags": mock_data["evaluators"][0]["flags"], + "evaluator": { + "flags": mock_data["evaluators"][0]["flags"], + }, }, ) # ---------------------------------------------------------------------- @@ -248,8 +253,9 @@ def test_query_evaluators_by_flags( # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() - assert response["count"] == 1 - assert response["evaluators"][0]["id"] == mock_data["evaluators"][0]["id"] + assert response["count"] >= 1 + evaluator_ids = [e["id"] for e in response["evaluators"]] + assert mock_data["evaluators"][0]["id"] in evaluator_ids # ---------------------------------------------------------------------- def test_query_evaluators_by_tags( @@ -258,35 +264,13 @@ def test_query_evaluators_by_tags( mock_data, ): # ACT ------------------------------------------------------------------ - # tags = quote(dumps(mock_data["evaluators"][0]["tags"])) - response = authed_api( - "POST", # TODO: FIX ME - "/preview/simple/evaluators/query", # TODO: FIX ME, - json={ - "tags": mock_data["evaluators"][0]["tags"], - }, - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert response["evaluators"][0]["id"] == mock_data["evaluators"][0]["id"] - # ---------------------------------------------------------------------- - - def test_query_evaluators_by_meta( - self, - authed_api, - mock_data, - ): - # ACT ------------------------------------------------------------------ - # meta = quote(dumps(mock_data["evaluators"][0]["meta"])) response = authed_api( - "POST", # TODO: FIX ME 
- "/preview/simple/evaluators/query", # TODO: FIX ME + "POST", + "/preview/simple/evaluators/query", json={ - "meta": mock_data["evaluators"][0]["meta"], + "evaluator": { + "tags": mock_data["evaluators"][0]["tags"], + }, }, ) # ---------------------------------------------------------------------- diff --git a/api/oss/tests/pytest/healthchecks/__init__.py b/api/oss/tests/pytest/e2e/healthchecks/__init__.py similarity index 100% rename from api/oss/tests/pytest/healthchecks/__init__.py rename to api/oss/tests/pytest/e2e/healthchecks/__init__.py diff --git a/api/oss/tests/pytest/healthchecks/test_healthchecks.py b/api/oss/tests/pytest/e2e/healthchecks/test_healthchecks.py similarity index 100% rename from api/oss/tests/pytest/healthchecks/test_healthchecks.py rename to api/oss/tests/pytest/e2e/healthchecks/test_healthchecks.py diff --git a/api/oss/tests/pytest/testsets/__init__.py b/api/oss/tests/pytest/e2e/testsets/__init__.py similarity index 100% rename from api/oss/tests/pytest/testsets/__init__.py rename to api/oss/tests/pytest/e2e/testsets/__init__.py diff --git a/api/oss/tests/pytest/testsets/test_testcases_basics.py b/api/oss/tests/pytest/e2e/testsets/test_testcases_basics.py similarity index 90% rename from api/oss/tests/pytest/testsets/test_testcases_basics.py rename to api/oss/tests/pytest/e2e/testsets/test_testcases_basics.py index 5e7c91f04f..afaf4428ce 100644 --- a/api/oss/tests/pytest/testsets/test_testcases_basics.py +++ b/api/oss/tests/pytest/e2e/testsets/test_testcases_basics.py @@ -95,29 +95,35 @@ def test_fetch_testcase(self, authed_api, mock_data): response = authed_api( "GET", - f"/preview/simple/testsets/testcases/{testcase_id}", + f"/preview/testcases/{testcase_id}", ) # ---------------------------------------------------------------------- # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() - print(response) - assert response["testcase"] == testcases[0] + assert response["testcase"]["id"] == testcase_id + assert response["testcase"]["data"] == testcases[0]["data"] # ---------------------------------------------------------------------- def test_list_testcases(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ + testset = mock_data["testsets"][0] + testset_id = testset["id"] + response = authed_api( - "GET", - "/preview/simple/testsets/testcases/", + "POST", + "/preview/testcases/query", + json={ + "testset_id": testset_id, + }, ) # ---------------------------------------------------------------------- # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() - assert response["count"] == 6 + assert response["count"] == len(testset["data"]["testcases"]) # ---------------------------------------------------------------------- def test_query_testcases_by_testcase_ids(self, authed_api, mock_data): @@ -128,7 +134,7 @@ def test_query_testcases_by_testcase_ids(self, authed_api, mock_data): response = authed_api( "POST", - "/preview/simple/testsets/testcases/query", + "/preview/testcases/query", json={ "testcase_ids": testcase_ids, }, @@ -148,7 +154,7 @@ def test_query_testcases_by_testset_id(self, authed_api, mock_data): response = authed_api( "POST", - "/preview/simple/testsets/testcases/query", + "/preview/testcases/query", json={ "testset_id": testset_id, }, diff --git a/api/oss/tests/pytest/testsets/test_testsets_basics.py 
b/api/oss/tests/pytest/e2e/testsets/test_testsets_basics.py similarity index 100% rename from api/oss/tests/pytest/testsets/test_testsets_basics.py rename to api/oss/tests/pytest/e2e/testsets/test_testsets_basics.py diff --git a/api/oss/tests/pytest/testsets/test_testsets_files.py b/api/oss/tests/pytest/e2e/testsets/test_testsets_files.py similarity index 100% rename from api/oss/tests/pytest/testsets/test_testsets_files.py rename to api/oss/tests/pytest/e2e/testsets/test_testsets_files.py diff --git a/api/oss/tests/pytest/testsets/test_testsets_queries.py b/api/oss/tests/pytest/e2e/testsets/test_testsets_queries.py similarity index 85% rename from api/oss/tests/pytest/testsets/test_testsets_queries.py rename to api/oss/tests/pytest/e2e/testsets/test_testsets_queries.py index f241d0f7c2..aeb3e6fc83 100644 --- a/api/oss/tests/pytest/testsets/test_testsets_queries.py +++ b/api/oss/tests/pytest/e2e/testsets/test_testsets_queries.py @@ -89,29 +89,35 @@ class TestTestsetsQueries: def test_list_testsets(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/simple/testsets/", + "POST", + "/preview/simple/testsets/query", + json={}, ) # ---------------------------------------------------------------------- # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() - assert response["count"] == 1 + testset_ids = [t["id"] for t in response["testsets"]] + assert mock_data["testsets"][0]["id"] in testset_ids + assert mock_data["testsets"][1]["id"] not in testset_ids # archived # ---------------------------------------------------------------------- def test_query_testsets_non_archived(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/simple/testsets/", + "POST", + "/preview/simple/testsets/query", + json={}, ) # ---------------------------------------------------------------------- # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() - assert response["count"] == 1 + testset_ids = [t["id"] for t in response["testsets"]] + assert mock_data["testsets"][0]["id"] in testset_ids + assert mock_data["testsets"][1]["id"] not in testset_ids # archived # ---------------------------------------------------------------------- def test_query_testsets_all(self, authed_api, mock_data): @@ -128,7 +134,9 @@ def test_query_testsets_all(self, authed_api, mock_data): # ASSERT --------------------------------------------------------------- assert response.status_code == 200 response = response.json() - assert response["count"] == 2 + testset_ids = [t["id"] for t in response["testsets"]] + assert mock_data["testsets"][0]["id"] in testset_ids + assert mock_data["testsets"][1]["id"] in testset_ids # ---------------------------------------------------------------------- def test_query_testsets_by_tags(self, authed_api, mock_data): @@ -153,28 +161,6 @@ def test_query_testsets_by_tags(self, authed_api, mock_data): assert response["testsets"][0]["id"] == mock_data["testsets"][0]["id"] # ---------------------------------------------------------------------- - def test_query_testsets_by_meta(self, authed_api, mock_data): - # ACT ------------------------------------------------------------------ - response = authed_api( - "POST", - "/preview/simple/testsets/query", - json={ - "testset": { - 
"meta": { - "meta1": "value1", - }, - }, - }, - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert response["testsets"][0]["id"] == mock_data["testsets"][0]["id"] - # ---------------------------------------------------------------------- - def test_query_testsets_by_refs(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ response = authed_api( diff --git a/api/oss/tests/pytest/tracing/__init__.py b/api/oss/tests/pytest/e2e/tracing/__init__.py similarity index 100% rename from api/oss/tests/pytest/tracing/__init__.py rename to api/oss/tests/pytest/e2e/tracing/__init__.py diff --git a/api/oss/tests/pytest/tracing/test_spans_basics.py b/api/oss/tests/pytest/e2e/tracing/test_spans_basics.py similarity index 93% rename from api/oss/tests/pytest/tracing/test_spans_basics.py rename to api/oss/tests/pytest/e2e/tracing/test_spans_basics.py index 040e916473..9524f384f3 100644 --- a/api/oss/tests/pytest/tracing/test_spans_basics.py +++ b/api/oss/tests/pytest/e2e/tracing/test_spans_basics.py @@ -1,6 +1,18 @@ +import time from uuid import uuid4 +def _wait_for_spans(authed_api, *, max_retries=15, delay=0.5): + """Poll until spans appear in the DB.""" + resp = None + for _ in range(max_retries): + resp = authed_api("POST", "/preview/tracing/spans/query") + if resp.status_code == 200 and resp.json()["count"] != 0: + return resp + time.sleep(delay) + return resp + + class TestSpansBasics: trace_ids = [ "1234567890abcdef1234567890abc000", @@ -15,7 +27,7 @@ def test_ingest_spans(self, authed_api): # ACT ------------------------------------------------------------------ response = authed_api( "POST", - "/preview/tracing/spans/", + "/preview/tracing/spans/ingest", json={ "spans": [ { @@ -30,8 +42,8 @@ def test_ingest_spans(self, authed_api): "attributes": { "ag": { "type": { - "trace": "undefined", - "span": "undefined", + "trace": "unknown", + "span": "unknown", "extra_type": "x", # unsupported }, "flags": {"env": True}, @@ -135,8 +147,8 @@ def test_query_spans(self, authed_api): "attributes": { "ag": { "type": { - "trace": "undefined", - "span": "undefined", + "trace": "unknown", + "span": "unknown", "extra_type": "x", # unsupported }, "flags": {"env": True}, @@ -210,10 +222,7 @@ def test_query_spans(self, authed_api): # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - response = authed_api( - "POST", - "/preview/tracing/spans/query", - ) + response = _wait_for_spans(authed_api) # ---------------------------------------------------------------------- # ASSERT --------------------------------------------------------------- diff --git a/api/oss/tests/pytest/tracing/test_spans_queries.py b/api/oss/tests/pytest/e2e/tracing/test_spans_queries.py similarity index 73% rename from api/oss/tests/pytest/tracing/test_spans_queries.py rename to api/oss/tests/pytest/e2e/tracing/test_spans_queries.py index f257cac565..5e0329d6db 100644 --- a/api/oss/tests/pytest/tracing/test_spans_queries.py +++ b/api/oss/tests/pytest/e2e/tracing/test_spans_queries.py @@ -1,24 +1,47 @@ +import time from uuid import uuid4 import pytest +TRACE_ID = uuid4().hex + + +def _wait_for_spans(authed_api, trace_id, *, expected=1, max_retries=15, delay=0.5): + """Poll until spans with the given 
trace_id appear in the DB.""" + resp = None + for _ in range(max_retries): + resp = authed_api( + "POST", + "/preview/tracing/spans/query", + json={ + "focus": "span", + "filter": { + "conditions": [ + { + "field": "trace_id", + "operator": "is", + "value": trace_id, + } + ] + }, + }, + ) + if resp.status_code == 200 and resp.json().get("count", 0) >= expected: + return resp + time.sleep(delay) + return resp + + @pytest.fixture(scope="class") def mock_data(authed_api): - trace_ids = [ - "1234567890abcdef1234567890abc000", - "1234567890abcdef1234567890abc001", - "1234567890abcdef1234567890abc002", - "1234567890abcdef1234567890abc003", - "1234567890abcdef1234567890abc004", - "1234567890abcdef1234567890abc005", - ] + trace_id = TRACE_ID # ARRANGE ------------------------------------------------------------------ spans = [ { - "trace_id": trace_ids[0], + "trace_id": trace_id, "span_id": "abcdef1234567890", "span_name": "parent_span", "span_kind": "SPAN_KIND_SERVER", @@ -29,8 +52,8 @@ def mock_data(authed_api): "attributes": { "ag": { "type": { - "trace": "undefined", - "span": "undefined", + "trace": "unknown", + "span": "unknown", "extra_type": "x", # unsupported }, "flags": {"env": True}, @@ -85,7 +108,7 @@ def mock_data(authed_api): ], }, { - "trace_id": trace_ids[0], + "trace_id": trace_id, "span_id": "1234567890abcdef", "parent_id": "abcdef1234567890", "span_name": "child_span", @@ -105,26 +128,42 @@ def mock_data(authed_api): ] response = authed_api( "POST", - "/preview/tracing/spans/", + "/preview/tracing/spans/ingest", json={"spans": spans}, ) assert response.status_code == 202 response = response.json() assert response["count"] == 2 + + _wait_for_spans(authed_api, trace_id, expected=2) # -------------------------------------------------------------------------- - _mock_data = {"spans": spans} + _mock_data = {"spans": spans, "trace_id": trace_id} return _mock_data -class TestSpansBasics: +class TestSpansQueries: def test_query_all(self, authed_api, mock_data): + trace_id = mock_data["trace_id"] + # ACT ------------------------------------------------------------------ response = authed_api( "POST", "/preview/tracing/spans/query", + json={ + "focus": "span", + "filter": { + "conditions": [ + { + "field": "trace_id", + "operator": "is", + "value": trace_id, + } + ] + }, + }, ) # ---------------------------------------------------------------------- @@ -136,20 +175,28 @@ def test_query_all(self, authed_api, mock_data): # ---------------------------------------------------------------------- def test_query_fts(self, authed_api, mock_data): + trace_id = mock_data["trace_id"] + # ACT ------------------------------------------------------------------ response = authed_api( "POST", "/preview/tracing/spans/query", json={ + "focus": "span", "filter": { "conditions": [ + { + "field": "trace_id", + "operator": "is", + "value": trace_id, + }, { "field": "content", "operator": "contains", "value": "hello world", - } + }, ] - } + }, }, ) # ---------------------------------------------------------------------- diff --git a/api/oss/tests/pytest/tracing/test_traces_basics.py b/api/oss/tests/pytest/e2e/tracing/test_traces_basics.py similarity index 91% rename from api/oss/tests/pytest/tracing/test_traces_basics.py rename to api/oss/tests/pytest/e2e/tracing/test_traces_basics.py index a02866adce..a72c13f94b 100644 --- a/api/oss/tests/pytest/tracing/test_traces_basics.py +++ b/api/oss/tests/pytest/e2e/tracing/test_traces_basics.py @@ -1,6 +1,18 @@ +import time from uuid import uuid4 +def 
_wait_for_trace(authed_api, trace_id, *, expect_count=1, max_retries=15, delay=0.5): + """Poll until the trace appears (or disappears) in the DB.""" + resp = None + for _ in range(max_retries): + resp = authed_api("GET", f"/preview/tracing/traces/{trace_id}") + if resp.status_code == 200 and resp.json()["count"] == expect_count: + return resp + time.sleep(delay) + return resp + + class TestTraceBasics: def test_create_trace(self, authed_api): # ACT ------------------------------------------------------------------ @@ -29,7 +41,10 @@ def test_create_trace(self, authed_api): "some.number": 123, "some.boolean": True, "some.array": [1, 2, 3], - "some.object": {"key1": "value1", "key2": "value2"}, + "some.object": { + "key1": "value1", + "key2": "value2", + }, "some.more.array.0": "array-value-0", "some.more.array.1": "array-value-1", "some.more.array.2": "array-value-2", @@ -101,7 +116,10 @@ def test_fetch_trace(self, authed_api): "some.number": 123, "some.boolean": True, "some.array": [1, 2, 3], - "some.object": {"key1": "value1", "key2": "value2"}, + "some.object": { + "key1": "value1", + "key2": "value2", + }, "some.more.array.0": "array-value-0", "some.more.array.1": "array-value-1", "some.more.array.2": "array-value-2", @@ -124,10 +142,7 @@ def test_fetch_trace(self, authed_api): # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - response = authed_api( - "GET", - f"/preview/tracing/traces/{trace_id}", - ) + response = _wait_for_trace(authed_api, trace_id, expect_count=1) # ---------------------------------------------------------------------- # ASSERT --------------------------------------------------------------- @@ -176,6 +191,8 @@ def test_edit_trace(self, authed_api): assert response.status_code == 202 response = response.json() assert response["count"] == 2 + + _wait_for_trace(authed_api, trace_id, expect_count=1) # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ @@ -200,7 +217,10 @@ def test_edit_trace(self, authed_api): "some.number": 123, "some.boolean": True, "some.array": [1, 2, 3], - "some.object": {"key1": "value1", "key2": "value2"}, + "some.object": { + "key1": "value1", + "key2": "value2", + }, "some.more.array.0": "array-value-0", "some.more.array.1": "array-value-1", "some.more.array.2": "array-value-2", @@ -275,6 +295,8 @@ def test_delete_trace(self, authed_api): assert response.status_code == 202 response = response.json() assert response["count"] == 1 + + _wait_for_trace(authed_api, trace_id, expect_count=1) # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ @@ -289,10 +311,7 @@ def test_delete_trace(self, authed_api): # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - response = authed_api( - "GET", - f"/preview/tracing/traces/{trace_id}", - ) + response = _wait_for_trace(authed_api, trace_id, expect_count=0) # ---------------------------------------------------------------------- # ASSERT --------------------------------------------------------------- diff --git a/api/oss/tests/pytest/workflows/__init__.py b/api/oss/tests/pytest/e2e/workflows/__init__.py similarity index 100% rename from api/oss/tests/pytest/workflows/__init__.py rename to api/oss/tests/pytest/e2e/workflows/__init__.py 
diff --git a/api/oss/tests/pytest/workflows/test_workflow_lineage.py b/api/oss/tests/pytest/e2e/workflows/test_workflow_lineage.py similarity index 96% rename from api/oss/tests/pytest/workflows/test_workflow_lineage.py rename to api/oss/tests/pytest/e2e/workflows/test_workflow_lineage.py index c11178906b..b99ae4e6a4 100644 --- a/api/oss/tests/pytest/workflows/test_workflow_lineage.py +++ b/api/oss/tests/pytest/e2e/workflows/test_workflow_lineage.py @@ -177,8 +177,13 @@ def mock_data(authed_api): assert response.status_code == 200 response = authed_api( - "GET", - "/preview/workflows/revisions/", + "POST", + "/preview/workflows/revisions/query", + json={ + "workflow_revision": { + "workflow_variant_id": workflow_variant_id, + }, + }, ) assert response.status_code == 200 @@ -241,7 +246,9 @@ def test_log_last_workflow_revisions_by_variant(self, authed_api, mock_data): def test_log_all_workflow_revisions(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ - workflow_revision = mock_data["workflow_revisions"][-1] + # Find the revision with the highest version (the latest explicit commit) + revisions = mock_data["workflow_revisions"] + workflow_revision = max(revisions, key=lambda r: r.get("version", 0)) response = authed_api( "POST", @@ -262,7 +269,8 @@ def test_log_all_workflow_revisions(self, authed_api, mock_data): def test_log_last_workflow_revisions(self, authed_api, mock_data): # ACT ------------------------------------------------------------------ - workflow_revision = mock_data["workflow_revisions"][-1] + revisions = mock_data["workflow_revisions"] + workflow_revision = max(revisions, key=lambda r: r.get("version", 0)) response = authed_api( "POST", diff --git a/api/oss/tests/pytest/workflows/test_workflow_revisions_basics.py b/api/oss/tests/pytest/e2e/workflows/test_workflow_revisions_basics.py similarity index 99% rename from api/oss/tests/pytest/workflows/test_workflow_revisions_basics.py rename to api/oss/tests/pytest/e2e/workflows/test_workflow_revisions_basics.py index 44ea13f912..98aa870f79 100644 --- a/api/oss/tests/pytest/workflows/test_workflow_revisions_basics.py +++ b/api/oss/tests/pytest/e2e/workflows/test_workflow_revisions_basics.py @@ -450,7 +450,7 @@ def test_commit_workflow_revision( "/preview/workflows/revisions/commit", json={ "workflow_revision": { - "id": workflow_revision_id, + "revision_id": workflow_revision_id, "slug": f"workflow-revision-new-{workflow_revision_slug}", "name": f"Workflow revision new {workflow_revision_slug}", "description": "Workflow revision new Description", diff --git a/api/oss/tests/pytest/workflows/test_workflow_revisions_queries.py b/api/oss/tests/pytest/e2e/workflows/test_workflow_revisions_queries.py similarity index 69% rename from api/oss/tests/pytest/workflows/test_workflow_revisions_queries.py rename to api/oss/tests/pytest/e2e/workflows/test_workflow_revisions_queries.py index 2f4121c8e3..e5680314b9 100644 --- a/api/oss/tests/pytest/workflows/test_workflow_revisions_queries.py +++ b/api/oss/tests/pytest/e2e/workflows/test_workflow_revisions_queries.py @@ -1,6 +1,4 @@ from uuid import uuid4 -from json import dumps -from urllib.parse import quote import pytest @@ -8,6 +6,8 @@ @pytest.fixture(scope="class") def mock_data(authed_api): # ARRANGE ------------------------------------------------------------------ + unique_marker = uuid4().hex[:8] + workflow_slug = uuid4() workflow = { @@ -94,11 +94,13 @@ def mock_data(authed_api): "tag1": "value1", "tag2": "value2", "tag3": "value3", + 
"_marker": unique_marker, }, "meta": { "meta1": "value1", "meta2": "value2", "meta3": "value3", + "_marker": unique_marker, }, "workflow_id": workflow_id, "workflow_variant_id": workflow_variant_id, @@ -108,7 +110,7 @@ def mock_data(authed_api): assert response.status_code == 200 - workflow_revision_id_0 = response.json()["workflow_revision"]["id"] + workflow_revision_0 = response.json()["workflow_revision"] workflow_revision_slug = uuid4() @@ -129,11 +131,13 @@ def mock_data(authed_api): "tag1": "value3", "tag2": "value2", "tag3": "value1", + "_marker": unique_marker, }, "meta": { "meta1": "value3", "meta2": "value2", "meta3": "value1", + "_marker": unique_marker, }, "workflow_id": workflow_id, "workflow_variant_id": workflow_variant_id, @@ -143,29 +147,39 @@ def mock_data(authed_api): assert response.status_code == 200 - workflow_revision_id_1 = response.json()["workflow_revision"]["id"] + workflow_revision_1 = response.json()["workflow_revision"] response = authed_api( "POST", - f"/preview/workflows/revisions/{workflow_revision_id_1}/archive", + f"/preview/workflows/revisions/{workflow_revision_1['id']}/archive", ) assert response.status_code == 200 response = authed_api( - "GET", - "/preview/workflows/revisions/?include_archived=true", + "POST", + "/preview/workflows/revisions/query", + json={ + "include_archived": True, + "workflow_revision": {"tags": {"_marker": unique_marker}}, + }, ) assert response.status_code == 200 response = response.json() assert response["count"] == 2 - assert response["workflow_revisions"][0]["id"] == workflow_revision_id_0 - assert response["workflow_revisions"][1]["id"] == workflow_revision_id_1 + rev_ids = {r["id"] for r in response["workflow_revisions"]} + assert workflow_revision_0["id"] in rev_ids + assert workflow_revision_1["id"] in rev_ids # -------------------------------------------------------------------------- - return response + _mock_data = { + "workflow_revisions": [workflow_revision_0, workflow_revision_1], + "_marker": unique_marker, + } + + return _mock_data class TestWorkflowRevisionsQueries: @@ -176,8 +190,11 @@ def test_query_non_archived_workflow_revisions( ): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/workflows/revisions/", + "POST", + "/preview/workflows/revisions/query", + json={ + "workflow_revision": {"tags": {"_marker": mock_data["_marker"]}}, + }, ) # ---------------------------------------------------------------------- @@ -198,8 +215,12 @@ def test_query_all_workflow_revisions( ): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/workflows/revisions/?include_archived=true", + "POST", + "/preview/workflows/revisions/query", + json={ + "include_archived": True, + "workflow_revision": {"tags": {"_marker": mock_data["_marker"]}}, + }, ) # ---------------------------------------------------------------------- @@ -207,10 +228,9 @@ def test_query_all_workflow_revisions( assert response.status_code == 200 response = response.json() assert response["count"] == 2 - assert ( - response["workflow_revisions"][0]["id"] - == mock_data["workflow_revisions"][0]["id"] - ) + rev_ids = {r["id"] for r in response["workflow_revisions"]} + assert mock_data["workflow_revisions"][0]["id"] in rev_ids + assert mock_data["workflow_revisions"][1]["id"] in rev_ids # ---------------------------------------------------------------------- def test_query_paginated_workflow_revisions( @@ -218,10 +238,18 @@ def 
test_query_paginated_workflow_revisions( authed_api, mock_data, ): - # ACT ------------------------------------------------------------------ + marker = mock_data["_marker"] + expected_ids = {r["id"] for r in mock_data["workflow_revisions"]} + + # ACT — page 1 -------------------------------------------------------- response = authed_api( - "GET", - "/preview/workflows/revisions/?include_archived=true&limit=1", + "POST", + "/preview/workflows/revisions/query", + json={ + "include_archived": True, + "workflow_revision": {"tags": {"_marker": marker}}, + "windowing": {"limit": 1}, + }, ) # ---------------------------------------------------------------------- @@ -229,17 +257,21 @@ def test_query_paginated_workflow_revisions( assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert ( - response["workflow_revisions"][0]["id"] - == mock_data["workflow_revisions"][0]["id"] - ) + seen_ids = {response["workflow_revisions"][0]["id"]} # ---------------------------------------------------------------------- - # ACT ------------------------------------------------------------------ + # ACT — page 2 -------------------------------------------------------- response = authed_api( - "GET", - "/preview/workflows/revisions/?include_archived=true" - f"&limit=1&next={response['workflow_revisions'][0]['id']}", + "POST", + "/preview/workflows/revisions/query", + json={ + "include_archived": True, + "workflow_revision": {"tags": {"_marker": marker}}, + "windowing": { + "limit": 1, + "next": response["workflow_revisions"][0]["id"], + }, + }, ) # ---------------------------------------------------------------------- @@ -247,16 +279,22 @@ def test_query_paginated_workflow_revisions( assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert ( - response["workflow_revisions"][0]["id"] - == mock_data["workflow_revisions"][1]["id"] - ) + seen_ids.add(response["workflow_revisions"][0]["id"]) + assert seen_ids == expected_ids # ---------------------------------------------------------------------- + # ACT — page 3 (empty) ------------------------------------------------ response = authed_api( - "GET", - "/preview/workflows/revisions/?include_archived=true" - f"&limit=1&next={response['workflow_revisions'][0]['id']}", + "POST", + "/preview/workflows/revisions/query", + json={ + "include_archived": True, + "workflow_revision": {"tags": {"_marker": marker}}, + "windowing": { + "limit": 1, + "next": response["workflow_revisions"][0]["id"], + }, + }, ) # ---------------------------------------------------------------------- @@ -271,49 +309,18 @@ def test_query_workflow_revisions_by_flags( authed_api, mock_data, ): - # ACT ------------------------------------------------------------------ - flags = quote(dumps(mock_data["workflow_revisions"][0]["flags"])) - response = authed_api( - "GET", - f"/preview/workflows/revisions/?flags={flags}", - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert ( - response["workflow_revisions"][0]["id"] - == mock_data["workflow_revisions"][0]["id"] - ) - # ---------------------------------------------------------------------- - - # ACT ------------------------------------------------------------------ - flags = quote(dumps({"is_custom": True})) - - response = authed_api( - "GET", - 
f"/preview/workflows/revisions/?flags={flags}", - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 0 - # ---------------------------------------------------------------------- + marker = mock_data["_marker"] - def test_query_workflow_revisions_by_tags( - self, - authed_api, - mock_data, - ): # ACT ------------------------------------------------------------------ - tags = quote(dumps(mock_data["workflow_revisions"][0]["tags"])) response = authed_api( - "GET", - f"/preview/workflows/revisions/?tags={tags}", + "POST", + "/preview/workflows/revisions/query", + json={ + "workflow_revision": { + "flags": mock_data["workflow_revisions"][0]["flags"], + "tags": {"_marker": marker}, + }, + }, ) # ---------------------------------------------------------------------- @@ -328,11 +335,15 @@ def test_query_workflow_revisions_by_tags( # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - tags = quote(dumps({"tag1": "value2"})) - response = authed_api( - "GET", - f"/preview/workflows/revisions/?tags={tags}", + "POST", + "/preview/workflows/revisions/query", + json={ + "workflow_revision": { + "flags": {"is_custom": True}, + "tags": {"_marker": marker}, + }, + }, ) # ---------------------------------------------------------------------- @@ -342,16 +353,20 @@ def test_query_workflow_revisions_by_tags( assert response["count"] == 0 # ---------------------------------------------------------------------- - def test_query_workflow_revisions_by_meta( + def test_query_workflow_revisions_by_tags( self, authed_api, mock_data, ): # ACT ------------------------------------------------------------------ - meta = quote(dumps(mock_data["workflow_revisions"][0]["meta"])) response = authed_api( - "GET", - f"/preview/workflows/revisions/?meta={meta}", + "POST", + "/preview/workflows/revisions/query", + json={ + "workflow_revision": { + "tags": mock_data["workflow_revisions"][0]["tags"], + }, + }, ) # ---------------------------------------------------------------------- @@ -366,11 +381,14 @@ def test_query_workflow_revisions_by_meta( # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - meta = quote(dumps({"meta1": "value2"})) - response = authed_api( - "GET", - f"/preview/workflows/revisions/?meta={meta}", + "POST", + "/preview/workflows/revisions/query", + json={ + "workflow_revision": { + "tags": {"tag1": "nonexistent_value"}, + }, + }, ) # ---------------------------------------------------------------------- diff --git a/api/oss/tests/pytest/workflows/test_workflow_variants_basics.py b/api/oss/tests/pytest/e2e/workflows/test_workflow_variants_basics.py similarity index 100% rename from api/oss/tests/pytest/workflows/test_workflow_variants_basics.py rename to api/oss/tests/pytest/e2e/workflows/test_workflow_variants_basics.py diff --git a/api/oss/tests/pytest/workflows/test_workflow_variants_queries.py b/api/oss/tests/pytest/e2e/workflows/test_workflow_variants_queries.py similarity index 66% rename from api/oss/tests/pytest/workflows/test_workflow_variants_queries.py rename to api/oss/tests/pytest/e2e/workflows/test_workflow_variants_queries.py index b294b45116..e6b911e28d 100644 --- 
a/api/oss/tests/pytest/workflows/test_workflow_variants_queries.py +++ b/api/oss/tests/pytest/e2e/workflows/test_workflow_variants_queries.py @@ -1,6 +1,4 @@ from uuid import uuid4 -from json import dumps -from urllib.parse import quote import pytest @@ -8,6 +6,8 @@ @pytest.fixture(scope="class") def mock_data(authed_api): # ARRANGE -------------------------------------------------------------- + unique_marker = uuid4().hex[:8] + workflow_slug = uuid4() workflow = { @@ -60,11 +60,13 @@ def mock_data(authed_api): "tag1": "value1", "tag2": "value2", "tag3": "value3", + "_marker": unique_marker, }, "meta": { "meta1": "value1", "meta2": "value2", "meta3": "value3", + "_marker": unique_marker, }, "workflow_id": workflow_id, } @@ -73,7 +75,7 @@ def mock_data(authed_api): assert response.status_code == 200 - workflow_variant_id_0 = response.json()["workflow_variant"]["id"] + workflow_variant_0 = response.json()["workflow_variant"] workflow_variant_slug = uuid4() @@ -94,11 +96,13 @@ def mock_data(authed_api): "tag1": "value1", "tag2": "value2", "tag3": "value3", + "_marker": unique_marker, }, "meta": { "meta1": "value1", "meta2": "value2", "meta3": "value3", + "_marker": unique_marker, }, "workflow_id": workflow_id, } @@ -107,29 +111,39 @@ def mock_data(authed_api): assert response.status_code == 200 - workflow_variant_id_1 = response.json()["workflow_variant"]["id"] + workflow_variant_1 = response.json()["workflow_variant"] response = authed_api( "POST", - f"/preview/workflows/variants/{workflow_variant_id_1}/archive", + f"/preview/workflows/variants/{workflow_variant_1['id']}/archive", ) assert response.status_code == 200 response = authed_api( - "GET", - "/preview/workflows/variants/?include_archived=true", + "POST", + "/preview/workflows/variants/query", + json={ + "include_archived": True, + "workflow_variant": {"tags": {"_marker": unique_marker}}, + }, ) assert response.status_code == 200 response = response.json() assert response["count"] == 2 - assert response["workflow_variants"][0]["id"] == workflow_variant_id_0 - assert response["workflow_variants"][1]["id"] == workflow_variant_id_1 + variant_ids = {v["id"] for v in response["workflow_variants"]} + assert workflow_variant_0["id"] in variant_ids + assert workflow_variant_1["id"] in variant_ids # -------------------------------------------------------------------------- - return response + _mock_data = { + "workflow_variants": [workflow_variant_0, workflow_variant_1], + "_marker": unique_marker, + } + + return _mock_data class TestWorkflowVariantsQueries: @@ -140,8 +154,11 @@ def test_query_non_archived_workflow_variants( ): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/workflows/variants/", + "POST", + "/preview/workflows/variants/query", + json={ + "workflow_variant": {"tags": {"_marker": mock_data["_marker"]}}, + }, ) # ---------------------------------------------------------------------- @@ -162,8 +179,12 @@ def test_query_all_workflow_variants( ): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/workflows/variants/?include_archived=true", + "POST", + "/preview/workflows/variants/query", + json={ + "include_archived": True, + "workflow_variant": {"tags": {"_marker": mock_data["_marker"]}}, + }, ) # ---------------------------------------------------------------------- @@ -171,10 +192,9 @@ def test_query_all_workflow_variants( assert response.status_code == 200 response = response.json() assert 
response["count"] == 2 - assert ( - response["workflow_variants"][0]["id"] - == mock_data["workflow_variants"][0]["id"] - ) + variant_ids = {v["id"] for v in response["workflow_variants"]} + assert mock_data["workflow_variants"][0]["id"] in variant_ids + assert mock_data["workflow_variants"][1]["id"] in variant_ids # ---------------------------------------------------------------------- def test_query_paginated_workflow_variants( @@ -182,10 +202,18 @@ def test_query_paginated_workflow_variants( authed_api, mock_data, ): - # ACT ------------------------------------------------------------------ + marker = mock_data["_marker"] + expected_ids = {v["id"] for v in mock_data["workflow_variants"]} + + # ACT — page 1 -------------------------------------------------------- response = authed_api( - "GET", - "/preview/workflows/variants/?include_archived=true&limit=1", + "POST", + "/preview/workflows/variants/query", + json={ + "include_archived": True, + "workflow_variant": {"tags": {"_marker": marker}}, + "windowing": {"limit": 1}, + }, ) # ---------------------------------------------------------------------- @@ -193,17 +221,21 @@ def test_query_paginated_workflow_variants( assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert ( - response["workflow_variants"][0]["id"] - == mock_data["workflow_variants"][0]["id"] - ) + seen_ids = {response["workflow_variants"][0]["id"]} # ---------------------------------------------------------------------- - # ACT ------------------------------------------------------------------ + # ACT — page 2 -------------------------------------------------------- response = authed_api( - "GET", - "/preview/workflows/variants/?include_archived=true" - f"&limit=1&next={response['workflow_variants'][0]['id']}", + "POST", + "/preview/workflows/variants/query", + json={ + "include_archived": True, + "workflow_variant": {"tags": {"_marker": marker}}, + "windowing": { + "limit": 1, + "next": response["workflow_variants"][0]["id"], + }, + }, ) # ---------------------------------------------------------------------- @@ -211,16 +243,22 @@ def test_query_paginated_workflow_variants( assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert ( - response["workflow_variants"][0]["id"] - == mock_data["workflow_variants"][1]["id"] - ) + seen_ids.add(response["workflow_variants"][0]["id"]) + assert seen_ids == expected_ids # ---------------------------------------------------------------------- + # ACT — page 3 (empty) ------------------------------------------------ response = authed_api( - "GET", - "/preview/workflows/variants/?include_archived=true" - f"&limit=1&next={response['workflow_variants'][0]['id']}", + "POST", + "/preview/workflows/variants/query", + json={ + "include_archived": True, + "workflow_variant": {"tags": {"_marker": marker}}, + "windowing": { + "limit": 1, + "next": response["workflow_variants"][0]["id"], + }, + }, ) # ---------------------------------------------------------------------- @@ -235,49 +273,18 @@ def test_query_workflow_variants_by_flags( authed_api, mock_data, ): - # ACT ------------------------------------------------------------------ - flags = quote(dumps(mock_data["workflow_variants"][0]["flags"])) - response = authed_api( - "GET", - f"/preview/workflows/variants/?flags={flags}", - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert 
response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert ( - response["workflow_variants"][0]["id"] - == mock_data["workflow_variants"][0]["id"] - ) - # ---------------------------------------------------------------------- - - # ACT ------------------------------------------------------------------ - flags = quote(dumps({"is_custom": True})) - - response = authed_api( - "GET", - f"/preview/workflows/variants/?flags={flags}", - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 0 - # ---------------------------------------------------------------------- + marker = mock_data["_marker"] - def test_query_workflow_variants_by_tags( - self, - authed_api, - mock_data, - ): # ACT ------------------------------------------------------------------ - tags = quote(dumps(mock_data["workflow_variants"][0]["tags"])) response = authed_api( - "GET", - f"/preview/workflows/variants/?tags={tags}", + "POST", + "/preview/workflows/variants/query", + json={ + "workflow_variant": { + "flags": mock_data["workflow_variants"][0]["flags"], + "tags": {"_marker": marker}, + }, + }, ) # ---------------------------------------------------------------------- @@ -292,11 +299,15 @@ def test_query_workflow_variants_by_tags( # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - tags = quote(dumps({"tag1": "value2"})) - response = authed_api( - "GET", - f"/preview/workflows/variants/?tags={tags}", + "POST", + "/preview/workflows/variants/query", + json={ + "workflow_variant": { + "flags": {"is_custom": True}, + "tags": {"_marker": marker}, + }, + }, ) # ---------------------------------------------------------------------- @@ -306,16 +317,20 @@ def test_query_workflow_variants_by_tags( assert response["count"] == 0 # ---------------------------------------------------------------------- - def test_query_workflow_variants_by_meta( + def test_query_workflow_variants_by_tags( self, authed_api, mock_data, ): # ACT ------------------------------------------------------------------ - meta = quote(dumps(mock_data["workflow_variants"][0]["meta"])) response = authed_api( - "GET", - f"/preview/workflows/variants/?meta={meta}", + "POST", + "/preview/workflows/variants/query", + json={ + "workflow_variant": { + "tags": mock_data["workflow_variants"][0]["tags"], + }, + }, ) # ---------------------------------------------------------------------- @@ -330,11 +345,14 @@ def test_query_workflow_variants_by_meta( # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - meta = quote(dumps({"meta1": "value2"})) - response = authed_api( - "GET", - f"/preview/workflows/variants/?meta={meta}", + "POST", + "/preview/workflows/variants/query", + json={ + "workflow_variant": { + "tags": {"tag1": "nonexistent_value"}, + }, + }, ) # ---------------------------------------------------------------------- diff --git a/api/oss/tests/pytest/workflows/test_workflows_basics.py b/api/oss/tests/pytest/e2e/workflows/test_workflows_basics.py similarity index 100% rename from api/oss/tests/pytest/workflows/test_workflows_basics.py rename to api/oss/tests/pytest/e2e/workflows/test_workflows_basics.py diff --git 
a/api/oss/tests/pytest/workflows/test_workflows_queries.py b/api/oss/tests/pytest/e2e/workflows/test_workflows_queries.py similarity index 64% rename from api/oss/tests/pytest/workflows/test_workflows_queries.py rename to api/oss/tests/pytest/e2e/workflows/test_workflows_queries.py index 91de434e0b..89badf31ce 100644 --- a/api/oss/tests/pytest/workflows/test_workflows_queries.py +++ b/api/oss/tests/pytest/e2e/workflows/test_workflows_queries.py @@ -1,6 +1,4 @@ from uuid import uuid4 -from json import dumps -from urllib.parse import quote import pytest @@ -8,6 +6,9 @@ @pytest.fixture(scope="class") def mock_data(authed_api): # ARRANGE -------------------------------------------------------------- + # Use unique tag values to isolate from default evaluators + unique_marker = uuid4().hex[:8] + workflow_slug = uuid4() workflow = { @@ -23,11 +24,13 @@ def mock_data(authed_api): "tag1": "value1", "tag2": "value2", "tag3": "value3", + "_marker": unique_marker, }, "meta": { "meta1": "value1", "meta2": "value2", "meta3": "value3", + "_marker": unique_marker, }, } @@ -39,7 +42,7 @@ def mock_data(authed_api): assert response.status_code == 200 - workflow_id_0 = response.json()["workflow"]["id"] + workflow_0 = response.json()["workflow"] workflow_slug = uuid4() @@ -56,11 +59,13 @@ def mock_data(authed_api): "tag1": "value1", "tag2": "2value", "tag3": "value3", + "_marker": unique_marker, }, "meta": { "meta1": "value1", "meta2": "2value", "meta3": "value3", + "_marker": unique_marker, }, } @@ -72,29 +77,40 @@ def mock_data(authed_api): assert response.status_code == 200 - workflow_id_1 = response.json()["workflow"]["id"] + workflow_1 = response.json()["workflow"] response = authed_api( "POST", - f"/preview/workflows/{workflow_id_1}/archive", + f"/preview/workflows/{workflow_1['id']}/archive", ) assert response.status_code == 200 + # Verify with marker-scoped query response = authed_api( - "GET", - "/preview/workflows/?include_archived=true", + "POST", + "/preview/workflows/query", + json={ + "include_archived": True, + "workflow": {"tags": {"_marker": unique_marker}}, + }, ) assert response.status_code == 200 response = response.json() assert response["count"] == 2 - assert response["workflows"][0]["id"] == workflow_id_0 - assert response["workflows"][1]["id"] == workflow_id_1 + workflow_ids = {w["id"] for w in response["workflows"]} + assert workflow_0["id"] in workflow_ids + assert workflow_1["id"] in workflow_ids # -------------------------------------------------------------------------- - return response + _mock_data = { + "workflows": [workflow_0, workflow_1], + "_marker": unique_marker, + } + + return _mock_data class TestWorkflowsQueries: @@ -105,8 +121,11 @@ def test_query_non_archived_workflows( ): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/workflows/", + "POST", + "/preview/workflows/query", + json={ + "workflow": {"tags": {"_marker": mock_data["_marker"]}}, + }, ) # ---------------------------------------------------------------------- @@ -124,8 +143,12 @@ def test_query_all_workflows( ): # ACT ------------------------------------------------------------------ response = authed_api( - "GET", - "/preview/workflows/?include_archived=true", + "POST", + "/preview/workflows/query", + json={ + "include_archived": True, + "workflow": {"tags": {"_marker": mock_data["_marker"]}}, + }, ) # ---------------------------------------------------------------------- @@ -133,7 +156,9 @@ def test_query_all_workflows( assert 
response.status_code == 200 response = response.json() assert response["count"] == 2 - assert response["workflows"][0]["id"] == mock_data["workflows"][0]["id"] + workflow_ids = {w["id"] for w in response["workflows"]} + assert mock_data["workflows"][0]["id"] in workflow_ids + assert mock_data["workflows"][1]["id"] in workflow_ids # ---------------------------------------------------------------------- def test_query_paginated_workflows( @@ -141,10 +166,18 @@ def test_query_paginated_workflows( authed_api, mock_data, ): - # ACT ------------------------------------------------------------------ + marker = mock_data["_marker"] + expected_ids = {w["id"] for w in mock_data["workflows"]} + + # ACT — page 1 -------------------------------------------------------- response = authed_api( - "GET", - "/preview/workflows/?include_archived=true&limit=1", + "POST", + "/preview/workflows/query", + json={ + "include_archived": True, + "workflow": {"tags": {"_marker": marker}}, + "windowing": {"limit": 1}, + }, ) # ---------------------------------------------------------------------- @@ -152,14 +185,21 @@ def test_query_paginated_workflows( assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert response["workflows"][0]["id"] == mock_data["workflows"][0]["id"] + seen_ids = {response["workflows"][0]["id"]} # ---------------------------------------------------------------------- - # ACT ------------------------------------------------------------------ + # ACT — page 2 -------------------------------------------------------- response = authed_api( - "GET", - "/preview/workflows/?include_archived=true" - f"&limit=1&next={response['workflows'][0]['id']}", + "POST", + "/preview/workflows/query", + json={ + "include_archived": True, + "workflow": {"tags": {"_marker": marker}}, + "windowing": { + "limit": 1, + "next": response["workflows"][0]["id"], + }, + }, ) # ---------------------------------------------------------------------- @@ -167,13 +207,22 @@ def test_query_paginated_workflows( assert response.status_code == 200 response = response.json() assert response["count"] == 1 - assert response["workflows"][0]["id"] == mock_data["workflows"][1]["id"] + seen_ids.add(response["workflows"][0]["id"]) + assert seen_ids == expected_ids # ---------------------------------------------------------------------- + # ACT — page 3 (empty) ------------------------------------------------ response = authed_api( - "GET", - "/preview/workflows/?include_archived=true" - f"&limit=1&next={response['workflows'][0]['id']}", + "POST", + "/preview/workflows/query", + json={ + "include_archived": True, + "workflow": {"tags": {"_marker": marker}}, + "windowing": { + "limit": 1, + "next": response["workflows"][0]["id"], + }, + }, ) # ---------------------------------------------------------------------- @@ -188,46 +237,18 @@ def test_query_workflows_by_flags( authed_api, mock_data, ): - # ACT ------------------------------------------------------------------ - flags = quote(dumps(mock_data["workflows"][0]["flags"])) - response = authed_api( - "GET", - f"/preview/workflows/?flags={flags}", - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 1 - assert response["workflows"][0]["id"] == mock_data["workflows"][0]["id"] - # ---------------------------------------------------------------------- + 
marker = mock_data["_marker"] # ACT ------------------------------------------------------------------ - flags = quote(dumps({"is_custom": True})) - - response = authed_api( - "GET", - f"/preview/workflows/?flags={flags}", - ) - # ---------------------------------------------------------------------- - - # ASSERT --------------------------------------------------------------- - assert response.status_code == 200 - response = response.json() - assert response["count"] == 0 - # ---------------------------------------------------------------------- - - def test_query_workflows_by_tags( - self, - authed_api, - mock_data, - ): - # ACT ------------------------------------------------------------------ - tags = quote(dumps(mock_data["workflows"][0]["tags"])) response = authed_api( - "GET", - f"/preview/workflows/?tags={tags}", + "POST", + "/preview/workflows/query", + json={ + "workflow": { + "flags": mock_data["workflows"][0]["flags"], + "tags": {"_marker": marker}, + }, + }, ) # ---------------------------------------------------------------------- @@ -239,11 +260,15 @@ def test_query_workflows_by_tags( # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - tags = quote(dumps({"tag1": "value2"})) - response = authed_api( - "GET", - f"/preview/workflows/?tags={tags}", + "POST", + "/preview/workflows/query", + json={ + "workflow": { + "flags": {"is_custom": True}, + "tags": {"_marker": marker}, + }, + }, ) # ---------------------------------------------------------------------- @@ -253,16 +278,20 @@ def test_query_workflows_by_tags( assert response["count"] == 0 # ---------------------------------------------------------------------- - def test_query_workflows_by_meta( + def test_query_workflows_by_tags( self, authed_api, mock_data, ): # ACT ------------------------------------------------------------------ - meta = quote(dumps(mock_data["workflows"][0]["meta"])) response = authed_api( - "GET", - f"/preview/workflows/?meta={meta}", + "POST", + "/preview/workflows/query", + json={ + "workflow": { + "tags": mock_data["workflows"][0]["tags"], + }, + }, ) # ---------------------------------------------------------------------- @@ -274,11 +303,14 @@ def test_query_workflows_by_meta( # ---------------------------------------------------------------------- # ACT ------------------------------------------------------------------ - meta = quote(dumps({"meta1": "value2"})) - response = authed_api( - "GET", - f"/preview/workflows/?meta={meta}", + "POST", + "/preview/workflows/query", + json={ + "workflow": { + "tags": {"tag1": "nonexistent_value"}, + }, + }, ) # ---------------------------------------------------------------------- diff --git a/api/oss/tests/pytest/workflows/test_workflows_retrieve.py b/api/oss/tests/pytest/e2e/workflows/test_workflows_retrieve.py similarity index 89% rename from api/oss/tests/pytest/workflows/test_workflows_retrieve.py rename to api/oss/tests/pytest/e2e/workflows/test_workflows_retrieve.py index 6ad36e577e..2e60a96f83 100644 --- a/api/oss/tests/pytest/workflows/test_workflows_retrieve.py +++ b/api/oss/tests/pytest/e2e/workflows/test_workflows_retrieve.py @@ -199,7 +199,10 @@ def test_retrieve_by_revision_id(self, authed_api, mock_data): response = authed_api( "GET", - f"/preview/workflows/revisions/retrieve?workflow_revision_id={revision_id}", + "/preview/workflows/revisions/retrieve", + json={ + "workflow_revision_ref": {"id": revision_id}, + }, ) assert response.status_code == 
200 @@ -218,8 +221,10 @@ def test_retrieve_by_revision_slug(self, authed_api, mock_data): response = authed_api( "GET", - f"/preview/workflows/revisions/retrieve" - f"?workflow_revision_slug={revision_slug}", + "/preview/workflows/revisions/retrieve", + json={ + "workflow_revision_ref": {"slug": revision_slug}, + }, ) assert response.status_code == 200 @@ -241,9 +246,11 @@ def test_retrieve_by_variant_id_revision_version(self, authed_api, mock_data): response = authed_api( "GET", - f"/preview/workflows/revisions/retrieve" - f"?workflow_variant_id={variant_id}" - f"&workflow_revision_version={revision_version}", + "/preview/workflows/revisions/retrieve", + json={ + "workflow_variant_ref": {"id": variant_id}, + "workflow_revision_ref": {"version": revision_version}, + }, ) assert response.status_code == 200 @@ -265,9 +272,11 @@ def test_retrieve_by_variant_slug_revision_version(self, authed_api, mock_data): response = authed_api( "GET", - f"/preview/workflows/revisions/retrieve" - f"?workflow_variant_slug={variant_slug}" - f"&workflow_revision_version={revision_version}", + "/preview/workflows/revisions/retrieve", + json={ + "workflow_variant_ref": {"slug": variant_slug}, + "workflow_revision_ref": {"version": revision_version}, + }, ) assert response.status_code == 200 @@ -286,7 +295,10 @@ def test_retrieve_by_variant_id(self, authed_api, mock_data): response = authed_api( "GET", - f"/preview/workflows/revisions/retrieve?workflow_variant_id={variant_id}", + "/preview/workflows/revisions/retrieve", + json={ + "workflow_variant_ref": {"id": variant_id}, + }, ) assert response.status_code == 200 @@ -305,8 +317,10 @@ def test_retrieve_by_variant_slug(self, authed_api, mock_data): response = authed_api( "GET", - f"/preview/workflows/revisions/retrieve" - f"?workflow_variant_slug={variant_slug}", + "/preview/workflows/revisions/retrieve", + json={ + "workflow_variant_ref": {"slug": variant_slug}, + }, ) assert response.status_code == 200 diff --git a/api/oss/tests/pytest/utils/accounts.py b/api/oss/tests/pytest/utils/accounts.py index 57b7b2a1a5..a83ff771a4 100644 --- a/api/oss/tests/pytest/utils/accounts.py +++ b/api/oss/tests/pytest/utils/accounts.py @@ -1,6 +1,8 @@ import requests import pytest +from uuid import uuid4 + from utils.constants import BASE_TIMEOUT @@ -11,9 +13,16 @@ def create_account(ag_env): headers = {"Authorization": f"Access {auth_key}"} url = f"{api_url}/admin/account" + unique_id = uuid4().hex[:12] + response = requests.post( url=url, headers=headers, + json={ + "user": { + "email": f"{unique_id}@test.agenta.ai", + }, + }, timeout=BASE_TIMEOUT, ) diff --git a/api/pytest.ini b/api/pytest.ini index b8cc765e0c..0db346b2ca 100644 --- a/api/pytest.ini +++ b/api/pytest.ini @@ -23,4 +23,8 @@ markers = case_typical: likely behavior case_edge: unlikely behavior speed_fast: ~ milliseconds - speed_slow: ~ seconds \ No newline at end of file + speed_slow: ~ seconds + license_oss: OSS license scope + license_ee: EE license scope + cost_free: no monetary cost (local/internal services) + cost_paid: uses paid third-party services (LLM APIs) \ No newline at end of file diff --git a/docs/designs/testing/README.md b/docs/designs/testing/README.md new file mode 100644 index 0000000000..fe07f5f712 --- /dev/null +++ b/docs/designs/testing/README.md @@ -0,0 +1,128 @@ +# Testing + +This directory specifies the testing strategy for the Agenta monorepo, covering all system interfaces: API, SDK, Web, Services, and Docs. 
The strategy uses orthogonal documents: principles describe the philosophy, boundaries describe architectural layers, dimensions describe filtering, structure describes folder layout, and interface documents describe per-component specifics. + +--- + +## Quick Reference + +### Core Specifications + +| Document | Description | +|----------|-------------| +| [testing.principles.specs.md](testing.principles.specs.md) | Philosophy, test pyramid, tradeoffs, mocking approach | +| [testing.boundaries.specs.md](testing.boundaries.specs.md) | Architectural test layers and what to test at each | +| [testing.dimensions.specs.md](testing.dimensions.specs.md) | Unified marker/tag taxonomy across all runners | +| [testing.structure.specs.md](testing.structure.specs.md) | Folder layout, file types, naming conventions | + +### Interface Specifications + +| Document | Description | +|----------|-------------| +| [testing.interfaces.specs.md](testing.interfaces.specs.md) | Overview of all system interfaces and testing matrix | +| [testing.interface.api.specs.md](testing.interface.api.specs.md) | API testing: current state, targets, mocking guidance | +| [testing.interface.sdk.specs.md](testing.interface.sdk.specs.md) | SDK testing: unit, integration, smoke | +| [testing.interface.web.specs.md](testing.interface.web.specs.md) | Web testing: Playwright E2E, data layer, component unit | + +### Supporting Documents + +| Document | Description | +|----------|-------------| +| [testing.fixtures.specs.md](testing.fixtures.specs.md) | Shared test infrastructure, accounts, helpers, scoping | +| [testing.running.specs.md](testing.running.specs.md) | How to run tests: local, cloud, CI | +| [testing.initial.specs.md](testing.initial.specs.md) | Original discussion-format spec (preserved as reference) | + +--- + +## Status Matrix + +Test folder structure is now **standardized** across all components with `manual/`, `legacy/`, and `pytest/`|`playwright/` containing `e2e/`, `unit/`, and `utils/` subdirectories. 
+ +| Component | Unit Tests | E2E Tests | Manual Tests | CI | +|-----------|-----------|-----------|--------------|-----| +| **API** | Structure ready (.gitkeep) | ✅ 155 tests across 7 domains | ✅ HTTP files, scripts | Linting only | +| **SDK** | ✅ 22 tests (tracing decorators) | ✅ 66 tests (SDK against live API) | ✅ Workflow tests, imports | Linting only | +| **Web** | ✅ Jotai atom tests (colocated) | ✅ Playwright feature suites | ✅ Data layer tests (manual) | Linting only | +| **Services** | Structure ready (.gitkeep) | Structure ready (.gitkeep) | ✅ smoke.http | N/A | +| **Docs** | N/A | Planned (link checking, build) | N/A | N/A | + +--- + +## Quick Start: Running Tests + +### API Tests + +```bash +# Run all E2E tests +cd api +AGENTA_API_URL=http://localhost:10180/api AGENTA_AUTH_KEY=change-me-auth \ + python -m pytest oss/tests/pytest/ -v + +# Run smoke tests only (fast subset) +python -m pytest oss/tests/pytest/ -v -m coverage_smoke + +# Run specific domain +python -m pytest oss/tests/pytest/e2e/workflows/ -v + +# Run with dimension filters +python -m pytest oss/tests/pytest/ -v -m "coverage_smoke and path_happy" +python -m pytest oss/tests/pytest/ -v -m "cost_free" # Exclude paid tests +``` + +### SDK Tests + +```bash +# Run all tests (unit + E2E) +cd sdk +AGENTA_API_URL=http://localhost:10180/api AGENTA_AUTH_KEY=change-me-auth \ + poetry run pytest tests/pytest/ -v + +# Run unit tests only (no external deps) +poetry run pytest tests/pytest/unit/ -v + +# Run E2E tests only (requires running API) +poetry run pytest tests/pytest/e2e/ -v -m e2e + +# Run with dimension filters +poetry run pytest tests/pytest/e2e/ -v -m "coverage_smoke and cost_free" +``` + +### Web Tests + +```bash +cd web/tests + +# Run smoke tests (OSS) - AGENTA_LICENSE not needed when path is explicit +AGENTA_WEB_URL=http://localhost:10180 \ +TESTMAIL_NAMESPACE=<your-namespace> \ +TESTMAIL_API_KEY=<your-key> \ + npx playwright test ../oss/tests/playwright/e2e/smoke.spec.ts + +# Run smoke tests (EE) +AGENTA_WEB_URL=http://localhost:10180 \ +TESTMAIL_NAMESPACE=<your-namespace> \ +TESTMAIL_API_KEY=<your-key> \ + npx playwright test ../ee/tests/playwright/e2e/smoke.spec.ts + +# Run all E2E tests for a specific feature (OSS) +npx playwright test ../oss/tests/playwright/e2e/settings/ + +# Run with tag filters (requires AGENTA_LICENSE when using default testDir) +AGENTA_LICENSE=oss npx playwright test --grep "@coverage:smoke" +AGENTA_LICENSE=oss npx playwright test --grep "@coverage:smoke.*@cost:free" +``` + +**Note:** Web tests require valid TESTMAIL credentials. See [web/tests/playwright.config.ts](../../web/tests/playwright.config.ts) for configuration details. 
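For reference, the `-m` and `--grep` filters above select on dimension markers that each E2E test declares. Below is a minimal sketch of an API E2E test carrying those markers; the payload is illustrative only, and `authed_api` is the shared fixture described in the fixtures spec:

```python
import pytest


@pytest.mark.coverage_smoke
@pytest.mark.path_happy
@pytest.mark.lens_functional
@pytest.mark.speed_fast
@pytest.mark.cost_free
@pytest.mark.license_oss
def test_query_workflows_smoke(authed_api):
    # `authed_api` injects the Authorization header for a test account.
    response = authed_api(
        "POST",
        "/preview/workflows/query",
        json={"workflow": {"tags": {"_marker": "nonexistent"}}},  # illustrative filter
    )

    assert response.status_code == 200
```

A run such as `pytest -m "coverage_smoke and cost_free"` would include this test, while `-m "not cost_paid"` would include it as well.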
+ +--- + +## Related In-Tree Documentation + +| Location | Description | +|----------|-------------| +| `web/tests/guides/` | Playwright E2E guides (generation, organization, fixtures, recording) | +| `sdk/tests/unit/README.md` | SDK unit test quick start | +| `sdk/tests/unit/TESTING_PATTERNS.md` | SDK testing patterns and approaches | +| `web/tests/playwright/config/testTags.ts` | Web test tag definitions | +| `api/pytest.ini` | API pytest configuration and markers | +| `sdk/pytest.ini` | SDK pytest configuration and markers | diff --git a/docs/designs/testing/testing.boundaries.specs.md b/docs/designs/testing/testing.boundaries.specs.md new file mode 100644 index 0000000000..1dc2c3e4c6 --- /dev/null +++ b/docs/designs/testing/testing.boundaries.specs.md @@ -0,0 +1,271 @@ +# Testing Boundaries + +Boundaries describe *where* in the architecture a test lives -- which layer it exercises and what it isolates. Each boundary defines what is under test, what is mocked or faked, and what assertions are appropriate. + +This document is interface-agnostic. For how boundaries apply to a specific interface, see the per-interface specs ([API](testing.interface.api.specs.md), [SDK](testing.interface.sdk.specs.md), [Web](testing.interface.web.specs.md)). + +--- + +## Folder structure and boundaries + +The standardized test folder structure maps to architectural boundaries: + +``` +tests/ + manual/ # Can test any boundary, not automated + legacy/ # Archived, not run + pytest/ or playwright/ + e2e/ # Boundary 5: E2E/system (black box) + unit/ # Boundaries 1-4: Architectural layers (white box) + utils/ # Boundary 1: Pure functions + core/ # Boundary 2: Business logic with mocked ports + adapters/ + db/ # Boundary 3: DAO with mocked session + http/ # Boundary 4: HTTP with in-process client + utils/ # Shared fixtures + library/tool tests +``` + +### Folder semantics and boundaries + +| Folder | Boundary coverage | Testing mode | Purpose | +|--------|------------------|--------------|---------| +| `e2e/` | Boundary 5 only | Black box, system running | Full integration across all layers | +| `unit/` | Boundaries 1-4 | White box, system NOT running | Layer isolation with dependency injection | +| `utils/` | Mixed | White box | Shared test fixtures + library/tool tests (boundary unclear) | +| `manual/` | Any boundary | Freestyle | Developer reference, not automated, can test any layer | + +### manual/ folder organization by domain + +The `manual/` folder has no fixed substructure but commonly organizes by domain or feature. Examples across interfaces: + +**API manual tests** (`api/oss/tests/manual/`): +- `annotations/crud.http` -- Annotation CRUD operations +- `auth/admin.http` -- Admin account creation +- `evaluations/*.http` -- Evaluation flows +- `testsets/*.http` -- Testset operations, testcase inclusion +- `tracing/*.http` -- Trace ingestion, filtering, windowing +- `workflows/*.http` -- Workflow artifacts, revisions, variants + +**SDK manual tests** (`sdk/tests/manual/`): +- `imports/*.py` -- Import and initialization tests +- `workflows/*.py` -- SDK workflow testing +- `tools/*.py` -- Tool invocation and schema validation + +**Web manual tests** (`web/oss/tests/manual/`): +- `datalayer/*.ts` -- Data layer integration tests (Jotai atoms against live API) + +**Services manual tests** (`services/oss/tests/manual/`): +- `smoke.http` -- Basic service health check + +Manual tests may exercise any boundary (pure utils, business logic, full E2E) but are not automated. 
They serve as developer reference for reproducing scenarios, testing flows, or validating behavior during development. + +--- + +## 1. Utils/helpers (pure unit) + +**Folder location:** `pytest/unit/utils/` or colocated with source (Web component tests) + +**What belongs here:** +- Parsing and formatting utilities (IDs, dates, pagination tokens). +- Validators and normalizers. +- Deterministic encoding and serialization (flatten/unflatten, safe encoders). +- Hashing helpers. +- Small algorithms used by Core or adapters. +- Error mapping utilities that are not bound to SQLAlchemy or HTTP specifics. + +**How to test:** +- Direct function calls. +- Table-driven tests (`pytest.mark.parametrize` / `test.each`). +- (Optional) Property-based tests for parsers and encoders. + +**Test doubles:** None needed. + +**Assertions:** Input to output equality. + +**Tradeoffs:** +- Fastest tests, highest signal, easy to cover edge cases. +- Avoid testing trivial wrappers around libraries unless they encode business rules. +- Do not create brittle tests that lock in implementation details. + +--- + +## 2. Core services (unit, mock ports) + +**Folder location:** `pytest/unit/core/` + +**What to test:** +- Invariants and state transitions. +- Orchestration across ports (repo/DAO, clock, ID generator, event bus, external clients). +- Domain-level error mapping (e.g., `AlreadyExists`, `NotFound`). +- Idempotency logic. +- Emitted domain events or commands (if applicable). + +**What to inject:** +- Fake or mock implementation of each DAO interface (port). +- Fake clock, fake ID generator where relevant. + +**Preference: fakes over mocks.** Fakes are preferred when Core behavior depends on persistence state (create-then-fetch, idempotency, sequences). Mocks are preferred when verifying interactions only (called once, called with specific args). + +**Assertions:** +- Return values match expected domain objects. +- Side effects occurred (port methods called with correct args). +- Domain errors raised for invalid states. + +**Tradeoffs:** +- Isolates Core perfectly; extremely fast and stable. +- Focuses on business logic and contracts. +- Correctness of SQL queries is NOT validated here (by design). +- If Core leaks adapter concerns (SQLAlchemy models or sessions), test isolation breaks. + +--- + +## 3. Adapters -- outbound/DB (unit, mock session) + +**Folder location:** `pytest/unit/adapters/db/` + +**The seam to mock:** +Even though DAOs receive an engine at construction time, the clean unit-test boundary is `AsyncSession` (or `async_sessionmaker`), not the engine. + +**Why AsyncSession, not engine:** +- DAOs call `session.execute(...)`, `session.commit()`, etc. +- Engine mocking pushes into internal plumbing (connections, pooling, begin blocks), which is brittle. +- Mocking sessions answers "did the DAO send the right request?" without running a database. + +**What to test:** +- Statement construction (SQLAlchemy statement shape). +- Bound parameters (values, required params present). +- Call sequence (execute, commit, rollback if the DAO controls it). +- Row-to-domain mapping (DBE to DTO). +- Exception mapping: SQLAlchemy/driver exceptions to domain persistence errors. + +**Two assertion styles:** + +1. **Fake session records calls** -- Assert that `execute()` was called with a statement and params matching expectations. +2. **Compile statement using Postgres dialect** -- Compile the SQLAlchemy statement with `postgresql.dialect()`, then assert on SQL fragments and compiled params. 
Avoid exact-string SQL comparisons to reduce brittleness. + +**Tradeoffs:** +- Fast and deterministic. +- Verifies adapter request construction and mapping logic. +- Enforces the adapter-to-port contract at unit level. +- Cannot validate real Postgres semantics: JSONB operators, ON CONFLICT behavior, type casting, locks, query planner. +- May go "green" while Postgres rejects the query in reality. +- The E2E suite becomes the only semantic safety net for database behavior. + +This is the explicit tradeoff accepted by skipping adapter integration tests. + +--- + +## 4. Adapters -- inbound/HTTP (unit, in-process) + +**Folder location:** `pytest/unit/adapters/http/` + +**How to test:** +- Build a FastAPI app with routes mounted. +- Override dependencies to inject mocked Core services. +- Use `httpx.AsyncClient` or FastAPI `TestClient` to call endpoints in-process (no running server). + +**What to test:** +- Request parsing and validation (422 for malformed input). +- Status codes and response shapes (200, 201, 404, 409, etc.). +- Error mapping at the HTTP boundary (domain errors to HTTP status and body). +- Auth boundary behaviors (if implemented in router or middleware). +- Pagination inputs and outputs. +- Content negotiation (JSON, file uploads, etc.). + +**Test doubles:** Mocked Core services injected via FastAPI dependency overrides. + +**Tradeoffs:** +- No server process, fast feedback. +- Protects API contract and translation logic. +- Does not validate full wiring with DAOs (by design). +- Cannot validate actual network stack behavior (TLS, reverse proxy headers). + +--- + +## 5. E2E/system (real dependencies) + +**Folder location:** `pytest/e2e/` or `playwright/e2e/` + +**Testing mode:** Black box. System is running. Tests only interact with public surfaces (API URLs, Web URLs) using credentials. + +Since adapter integration tests are skipped, E2E is the only "real dependency" validation. + +**What E2E must validate (because nothing else will):** +1. Wiring across layers: routers to core to DAO to database. +2. Postgres semantics that mocks cannot catch: + - Constraints (unique, foreign key). + - Transactionality and rollbacks. + - Postgres-specific features: JSONB, full-text search, ON CONFLICT, RETURNING. + - Driver error shapes and mapping correctness. + +**Scope:** +A minimal E2E suite that pays for itself: +- Happy-path CRUD for key entities. +- Constraint case (unique violation to correct error mapping). +- Transaction case (force mid-operation failure to ensure rollback). +- Idempotency or concurrency case (if relevant). + +**How to run:** +- Spin a real Postgres instance (docker-compose or testcontainers). +- Run migrations. +- Run the FastAPI app (either in-process ASGI client with real DI wiring, or as a process called over HTTP). + +**Examples across interfaces:** +- **API E2E** (`api/oss/tests/pytest/e2e/`): HTTP requests to API endpoints, organized by domain (workflows, evaluations, testsets, etc.) +- **SDK E2E** (`sdk/tests/pytest/e2e/`): SDK client calls against live API (workflows, evaluations, observability) +- **Web E2E** (`web/oss/tests/playwright/e2e/`): Playwright browser tests against running web app (settings, app, playground, etc.) + +--- + +## 6. The utils/ folder: dual purpose + +**Folder location:** `pytest/utils/` or `playwright/utils/` + +The `utils/` folder serves two distinct purposes: + +### 6.1. 
Shared test fixtures (primary use) + +Test infrastructure shared by `e2e/` and `unit/` tests: +- **Fixture modules** -- pytest fixtures, Playwright helpers +- **Account management** -- Test account creation and cleanup +- **API clients** -- Authenticated/unauthenticated HTTP clients +- **Test constants** -- Timeouts, base URLs, environment variables + +**Examples:** +- `api/oss/tests/pytest/utils/api.py` -- `authed_api`, `unauthed_api` fixtures +- `api/oss/tests/pytest/utils/accounts.py` -- `cls_account`, `mod_account`, `foo_account` fixtures +- `sdk/tests/pytest/utils/sdk.py` -- SDK client fixtures +- `web/tests/playwright/utils/` -- Playwright utility helpers (currently `.gitkeep` placeholder) + +### 6.2. Library and tool tests (secondary use) + +Tests for **libraries, tools, and helper functions** that the system uses but that aren't part of the system's core business logic: +- Shared validation libraries +- Internal benchmark utilities +- Helper functions with edge cases +- Infrastructure tooling + +**Boundary ambiguity:** There's a gray line between `unit/utils/` (pure business utilities, Boundary 1) and `utils/` (tooling utilities). When in doubt: +- If it's business domain logic → `unit/utils/` +- If it's infrastructure/tooling → `utils/` + +**Current state:** Most `utils/` folders currently contain only shared fixtures. Library/tool tests may be added as needed. + +--- + +## 7. What NOT to test at unit level + +The following are explicitly excluded from unit-level test infrastructure: + +- A running Postgres instance. +- A running web server process. +- Any "fake Postgres server" or database emulator. +- SQLite in-memory as a substitute for Postgres. + +**Why SQLite in-memory does not help:** +- Core tests should depend on ports (interfaces), not SQL adapters. SQLite introduces an adapter dependency into what should be a pure unit test. +- If the DAO is mocked, SQLite is redundant. +- If the DAO is not mocked, the test is no longer "Core only" -- it tests a persistence adapter too. +- SQLite and Postgres have different SQL dialects, type systems, and constraint behaviors. A passing SQLite test provides false confidence about Postgres behavior. + +For Core unit tests, prefer in-memory fake implementations of the DAO port (pure Python). diff --git a/docs/designs/testing/testing.dimensions.specs.md b/docs/designs/testing/testing.dimensions.specs.md new file mode 100644 index 0000000000..b1e9375fe3 --- /dev/null +++ b/docs/designs/testing/testing.dimensions.specs.md @@ -0,0 +1,165 @@ +# Testing Dimensions + +## Concept + +Dimensions are orthogonal classification axes applied to tests. They enable +selective test execution via CLI flags or markers. Each dimension is independent +of the others -- a test may carry any combination of dimension markers. + +Dimensions are independent of boundaries. A test at any boundary (unit, +integration, E2E) can carry dimension markers, though in practice dimensions are +applied primarily to E2E tests. Unit tests generally do not need dimensions. + +## Shared dimensions + +These dimensions are common across all three runners (API, SDK, Web). Some dimensions have interface-specific values. + +| Dimension | Values | Semantics | +| --------- | ------ | --------- | +| coverage | `smoke`, `full` (API/SDK); `smoke`, `sanity`, `light`, `full` (Web) | Breadth and depth of testing. `smoke` is breadth over depth; `full` is breadth and depth. Web adds `sanity` (narrow breadth, deep depth) and `light` (smoke + sanity). 
| +| path | `happy`, `grumpy` | Desired behavior vs undesired behavior (error states, invalid inputs). | +| case | `typical`, `edge` | Likely scenarios vs unlikely scenarios. | +| lens | `functional`, `performance`, `security` | The quality attribute under test: correctness, latency, or security posture. | +| speed | `fast`, `slow` | Expected duration. `fast` targets millisecond-scale execution; `slow` targets second-scale execution. | +| cost | `free`, `paid` | Whether the test incurs monetary costs. `free` = purely code execution (local services, internal APIs, free services). `paid` = uses paid third-party services (LLM APIs, external APIs with usage costs). | +| role | `owner`, `admin`, `editor`, `viewer` | The user permission level under which the test executes. API/SDK include `admin` role; Web uses `owner`, `editor`, `viewer`. | +| plan | `hobby`, `pro`, `business`, `enterprise` | The organization plan level under which the test executes. API/SDK include all tiers; Web typically uses `hobby`, `pro`. | +| license | `oss`, `ee` | License scope. **Dual usage:** (1) Structural organization via folder paths (`oss/tests/` vs `ee/tests/`) for local test organization; (2) Explicit markers/tags for filtering when testing against remote environments where folder structure doesn't indicate the remote server's license. | +| scope | Interface-specific values | The functional area or domain of the application under test. Web: `auth`, `apps`, `playground`, `datasets`, `evaluations`, `settings`, `deployment`, `observability`. API/SDK: Handled via directory structure (e.g., `workflows/`, `evaluations/`) rather than explicit markers. | + +## Syntax mapping + +### Pytest (API/SDK) + +Markers follow the pattern `@pytest.mark.{dimension}_{value}`. + +```python +@pytest.mark.coverage_smoke +@pytest.mark.path_happy +@pytest.mark.lens_functional +@pytest.mark.speed_fast +@pytest.mark.cost_free +@pytest.mark.license_oss +def test_create_workflow(): + ... +``` + +Example with EE-only feature: + +```python +@pytest.mark.coverage_smoke +@pytest.mark.path_happy +@pytest.mark.lens_functional +@pytest.mark.cost_free +@pytest.mark.license_ee +def test_workspace_management(): + ... +``` + +Example with paid third-party service (LLM API): + +```python +@pytest.mark.coverage_smoke +@pytest.mark.path_happy +@pytest.mark.lens_functional +@pytest.mark.cost_paid # Uses OpenAI API +@pytest.mark.license_oss +def test_llm_generation(): + ... +``` + +CLI filtering uses the `-m` flag with marker expressions: + +```bash +pytest -m coverage_smoke +pytest -m "coverage_smoke and path_happy" +pytest -m "coverage_smoke and lens_functional and speed_fast" +pytest -m "cost_free" # Run only free tests +pytest -m "not cost_paid" # Exclude tests that cost money +pytest -m "license_oss" # Run only OSS tests (e.g., against remote OSS server) +pytest -m "license_ee" # Run only EE tests (e.g., against remote EE server) +``` + +### Playwright (Web) + +Tags follow the pattern `@{dimension}:{value}`. + +```typescript +test("create app @coverage:smoke @path:happy @lens:functional @speed:fast @cost:free @license:oss", async () => { + ... +}) +``` + +Example with EE-only feature: + +```typescript +test("manage workspace @coverage:smoke @path:happy @lens:functional @cost:free @license:ee", async () => { + ... +}) +``` + +Example with paid third-party service (LLM API): + +```typescript +test("generate with LLM @coverage:smoke @path:happy @lens:functional @cost:paid @license:oss", async () => { + // Test that calls OpenAI/Anthropic/etc API + ... 
+}) +``` + +CLI filtering uses dimension-specific flags: + +```bash +npx playwright test -coverage smoke +npx playwright test -coverage smoke -path happy +npx playwright test -coverage smoke -lens functional -speed fast +npx playwright test -cost free # Run only free tests +npx playwright test -license oss # Run only OSS tests (e.g., against remote OSS server) +npx playwright test -license ee # Run only EE tests (e.g., against remote EE server) +``` + +The full tag syntax mapping from `testTags.ts`: + +| Dimension | CLI flag | Tag prefix | +| --------- | -------- | ---------- | +| scope | `-scope` | `@scope:` | +| coverage | `-coverage` | `@coverage:` | +| path | `-path` | `@path:` | +| plan | `-plan` | `@plan:` | +| role | `-role` | `@role:` | +| lens | `-lens` | `@lens:` | +| case | `-case` | `@case:` | +| speed | `-speed` | `@speed:` | +| license | `-license` | `@license:` | +| cost | `-cost` | `@cost:` | + +## Usage guidelines + +- Apply dimension markers to E2E tests. Unit tests generally do not need dimensions. +- Every E2E test should have at minimum: `coverage`, `path`, `lens`, and `cost` markers. +- Use `coverage_smoke` / `@coverage:smoke` for the smallest set that validates basic functionality. +- Use `path_happy` / `@path:happy` for expected flows, `path_grumpy` / `@path:grumpy` for error states and invalid inputs. +- **Always mark `cost`** -- `cost_free` / `@cost:free` for tests that only use local/internal services, `cost_paid` / `@cost:paid` for tests that call paid third-party APIs (LLMs, external services with usage costs). +- **Mark `license`** when the test is specific to a license level -- `license_oss` / `@license:oss` for OSS-only features, `license_ee` / `@license:ee` for EE-only features. Use these markers to filter when testing against remote environments. +- Combine dimensions to build targeted test suites: + - `"smoke happy functional fast free"` -- Fast CI gate without costs + - `"coverage_smoke and cost_free"` -- Quick validation without spending money + - `"not cost_paid"` -- Exclude all tests that incur charges + - `"coverage_smoke and license_oss"` -- Test against remote OSS environment + - `"license_ee"` -- Test against remote EE environment + +## Design rules + +- **Dimension application:** Dimensions apply primarily to E2E tests. Unit tests generally do not need dimension markers. +- **`coverage` semantics:** Running with `coverage_full` (or no coverage filter) means all tests run. `full` is not a separate tier to mark individually -- it means "no filter applied." +- **`scope` in API/SDK:** Handled via directory structure (e.g., `pytest/e2e/workflows/`, `pytest/e2e/evaluations/`) rather than explicit markers. Web uses explicit `@scope:` tags. +- **`license` has dual usage:** Tests are organized structurally by folder (`oss/tests/` vs `ee/tests/`) for clarity. Explicit markers (`@pytest.mark.license_oss` / `@license:oss` tags) enable filtering when testing against remote environments where the folder structure doesn't indicate the remote server's license (e.g., running local tests against a remote staging server). Use markers when targeting specific remote license environments. 
+- **Interface-specific values:** Some shared dimensions have interface-specific values: + - `coverage`: API/SDK use `smoke`/`full`; Web adds `sanity`/`light` + - `role`: API/SDK include `admin`; Web uses `owner`/`editor`/`viewer` + - `plan`: API/SDK include all tiers; Web typically uses `hobby`/`pro` +- **`cost` dimension clarifications:** + - Mark `cost_free` / `@cost:free` if the test only exercises code, local services, internal APIs, or free external services (e.g., public APIs with no usage limits). + - Mark `cost_paid` / `@cost:paid` if the test makes calls to paid third-party services where execution incurs monetary charges (LLM APIs like OpenAI/Anthropic/Cohere, cloud services with per-request pricing, etc.). + - Tests hitting our own API/services are `cost_free` unless the API itself proxies to a paid service. + - When in doubt: if running the test 1000 times would increase your cloud bill, mark it `cost_paid`. diff --git a/docs/designs/testing/testing.fixtures.specs.md b/docs/designs/testing/testing.fixtures.specs.md new file mode 100644 index 0000000000..4373512373 --- /dev/null +++ b/docs/designs/testing/testing.fixtures.specs.md @@ -0,0 +1,181 @@ +# Testing Fixtures --- Shared Test Infrastructure + +This document describes the reusable test utilities, fixtures, and support infrastructure across the monorepo. It covers per-interface fixtures, shared support utilities, account management, and fixture scoping rules. + +For per-interface specifics, see [testing.interface.api.specs.md](testing.interface.api.specs.md), [testing.interface.sdk.specs.md](testing.interface.sdk.specs.md), [testing.interface.web.specs.md](testing.interface.web.specs.md). +For folder layout of test support files, see [testing.structure.specs.md](testing.structure.specs.md). + +--- + +## API fixtures + +Defined in `api/oss/tests/pytest/utils/` and imported via `api/oss/tests/pytest/conftest.py`. + +### Environment (`utils/env.py`) + +| Fixture | Scope | Source | Returns | +|---------|-------|--------|---------| +| `ag_env` | session | `AGENTA_API_URL`, `AGENTA_AUTH_KEY` env vars | `{"api_url": str, "auth_key": str}` | + +Asserts both variables are set. Fails fast if missing. + +### API clients (`utils/api.py`) + +| Fixture | Scope | Depends on | Returns | +|---------|-------|-----------|---------| +| `unauthed_api` | session | `ag_env` | Callable `(method, endpoint, **kwargs) -> Response` | +| `authed_api` | class | `cls_account` | Callable `(method, endpoint, **kwargs) -> Response` with `Authorization` header | + +- `unauthed_api` uses a shared `requests.Session`. Session is closed after all tests. +- `authed_api` injects `Authorization: <credentials>` header from the account fixture. Does not use a shared session. +- Both use `BASE_TIMEOUT = 10` seconds (from `utils/constants.py`). + +### Account fixtures (`utils/accounts.py`) + +| Fixture | Scope | Purpose | +|---------|-------|---------| +| `cls_account` | class | Creates a test account, shared within a test class | +| `mod_account` | module | Creates a test account, shared across classes in a module | +| `foo_account` | function | Creates a test account per test function (full isolation) | + +All three call `create_account(ag_env)` which: +1. POSTs to `/admin/account` with `Authorization: Access <auth_key>` header +2. Extracts `credentials` from the first scope in the response +3. Returns `{"api_url": str, "credentials": str}` + +--- + +## SDK fixtures + +Defined in `sdk/tests/integration/conftest.py`. 
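The subsections below detail each fixture group. As a quick, hypothetical illustration of how they combine in a test (fixture names as documented below; assertions are illustrative only):

```python
import pytest


@pytest.mark.integration  # auto-skipped when AGENTA_API_KEY is not set
def test_created_app_is_usable(agenta_init, test_app):
    # `agenta_init` has already called ag.init(...) for this test, and
    # `test_app` created a throwaway app that it deletes on teardown.
    assert test_app["app_id"]
    assert test_app["app_slug"]
```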
+ +### Credential management + +| Fixture/Helper | Type | Purpose | +|----------------|------|---------| +| `get_api_credentials()` | Function | Returns `(host, api_key)` from `AGENTA_HOST` (default: `https://cloud.agenta.ai`) and `AGENTA_API_KEY` | +| `credentials_available()` | Function | Returns `bool` --- whether `AGENTA_API_KEY` is set | +| `_skip_integration_if_missing_credentials` | autouse fixture | Skips tests marked `@pytest.mark.integration` when credentials are missing | +| `requires_credentials` | Skip marker | `@pytest.mark.skipif` decorator for non-marker-based skipping | +| `api_credentials` | session fixture | Returns `(host, api_key)`. Skips test if credentials are missing. | + +### SDK initialization + +| Fixture | Scope | Purpose | +|---------|-------|---------| +| `agenta_init` | function | Calls `ag.init(host, api_key)` then `_force_reinit_sdk()` to rebind httpx clients to the current event loop | + +`_force_reinit_sdk()` resets the `AgentaSingleton`'s `api` and `async_api` clients by creating new `AgentaApi` and `AsyncAgentaApi` instances. This is necessary because `pytest-asyncio` creates a new event loop for async tests, making previously-bound httpx clients stale. + +### Resource management + +| Fixture | Scope | Purpose | +|---------|-------|---------| +| `test_app` | function | Creates app via `AppManager.create()`, yields `{app_id, app_slug, response}`, deletes on teardown | +| `test_variant` | function | Creates variant via `SharedManager.add()`, yields `{variant_slug, variant_id, app_id, app_slug, response}`, deletes on teardown | +| `unique_app_slug` | function | Returns `f"test-app-{uuid4().hex[:8]}"` | +| `unique_variant_slug` | function | Returns `f"test-variant-{uuid4().hex[:8]}"` | +| `deterministic_testset_name` | session | Returns `"sdk-it-testset-v1"` --- deterministic to avoid proliferation | +| `deterministic_evaluator_slug` | session | Returns `"sdk-it-evaluator-v1"` | +| `deterministic_legacy_application_slug` | session | Returns `"sdk-it-legacy-app-v1"` | + +### Cleanup helpers + +| Helper | Purpose | +|--------|---------| +| `cleanup_app_safe(app_id)` | Deletes app, catches and logs errors | +| `cleanup_variant_safe(variant_id, variant_slug, app_id)` | Deletes variant, catches and logs errors | + +### OTLP support + +| Fixture | Scope | Purpose | +|---------|-------|---------| +| `otlp_flat_span_factory` | session | Returns `make_otlp_flat_span()` factory for creating `OTelFlatSpanInput` objects | + +--- + +## Web fixtures + +Defined in `web/tests/tests/fixtures/`. + +### Base fixture (`base.fixture/`) + +| Helper | Purpose | +|--------|---------| +| `apiHelpers/` | API request utilities for test setup/teardown | +| `uiHelpers/` | DOM interaction helpers (click, fill, wait) | +| `llmKeysSettingsHelpers/` | LLM provider key configuration | + +### User fixture (`user.fixture/`) + +| Helper | Purpose | +|--------|---------| +| `authHelpers/` | Authentication flows --- email/password account creation and login | + +### Session fixture (`session.fixture/`) + +Manages browser session persistence via `state.json` storage state. Used by Playwright for authenticated test sessions. + +### Global setup/teardown + +- `web/tests/playwright/global-setup/` --- Runs before all tests: creates accounts, sets up auth state +- `web/tests/playwright/global-teardown/` --- Runs after all tests: cleanup + +--- + +## Support utilities (target) + +The `_support/` directory pattern provides shared test helpers. 
Target structure for API and SDK: + +``` +tests/_support/ + fakes.py # In-memory fake implementations of ports/interfaces + builders.py # Factory functions for domain objects and DTOs + assertions.py # Common assertion helpers (e.g., assert_has_attr) +``` + +### Fakes + +In-memory implementations of DAO interfaces (ports) are provided for Core unit tests. They store data in dicts/lists, support create/read/update/delete operations, and return realistic domain objects. They do not depend on SQLAlchemy, asyncpg, or any DB infrastructure. + +### Builders + +Factory functions create domain objects with sensible defaults: +```python +def build_workflow(*, slug="test", name="Test Workflow", **overrides): + return Workflow(slug=slug, name=name, **overrides) +``` + +### Assertions + +Reusable assertion helpers are provided for common patterns: +```python +def assert_has_attr(obj, attr_name): + assert hasattr(obj, attr_name), f"{type(obj).__name__} missing attribute '{attr_name}'" +``` + +--- + +## Account management + +Both API and SDK tests create test accounts programmatically: + +- **API tests:** POST to `/admin/account` with `Authorization: Access <AGENTA_AUTH_KEY>`. Returns scoped credentials. Different fixture scopes (class/module/function) control account reuse. +- **SDK integration tests:** Use `AGENTA_API_KEY` directly. No account creation --- the key is pre-provisioned. + +--- + +## Fixture scoping rules + +| Scope | Pytest | When to use | +|-------|--------|-------------| +| `session` | Once per test run | Environment variables, shared HTTP sessions, read-only configuration | +| `module` | Once per `.py` file | Account/resource setup shared across multiple test classes | +| `class` | Once per test class | Account/resource setup shared within a class (`TestXxxBasics`) | +| `function` | Once per test | Full isolation --- tests that mutate state or need unique resources | + +**Guidelines:** +- The broadest scope that does not cause test interference is preferred. +- Account fixtures should match the scope of the test class using them (typically `class`). +- Resources that tests mutate should be `function`-scoped. +- `yield`-based fixtures are preferred for cleanup over `try/finally` (unless cleanup needs the fixture value after yield). diff --git a/docs/designs/testing/testing.initial.specs.md b/docs/designs/testing/testing.initial.specs.md new file mode 100644 index 0000000000..5438edefb9 --- /dev/null +++ b/docs/designs/testing/testing.initial.specs.md @@ -0,0 +1,378 @@ +# Ports & Adapters Testing Strategy (Pytest) +*(Unit-only layers + one E2E, plus utils/helpers)* + +This document captures the full context of the discussion and the resulting testing strategy for a **ports & adapters (hexagonal)** architecture using **FastAPI**, **SQLAlchemy async**, and **asyncpg**, with **inversion of control** wiring. + +--- + +## Context: the architecture you described + +You currently have **inversion of control** / dependency injection wiring roughly like: + +1. **Outbound adapter (DB)**: Create a SQLAlchemy **engine** (async, asyncpg driver) and create a DAO implementation per entity. +2. **Core**: Core defines a **DAO interface (port)**. Core services are created by passing an implementation of that port (the DAO). +3. **Inbound adapter (HTTP)**: Routers receive Core services. +4. Compose routes into a FastAPI app and run it. + +So dependencies flow "inward": +- Routers depend on Core services. +- Core depends on ports (interfaces). 
+- Adapters implement ports (DAOs) and depend on infrastructure (SQLAlchemy session/engine). +- The composition root wires everything together. + +You explicitly want: +- Clear separation between **Core**, **routers**, and **DAOs** +- **Unit tests** for each layer using mocks/fakes (not a running DB/server) +- **One E2E** test suite that runs the real API with the real DB +- Additionally: **unit tests for utils/helpers** + +You also explicitly requested to **drop integration tests** (e.g., DAO↔real Postgres component tests). + +--- + +## Boundaries vs dimensions (API testing only, for now) + +**Boundaries** describe *where* tests live in the architecture. +**Dimensions** describe *how* E2E tests are filtered or categorized. +These are orthogonal concerns. + +Current state: +- The existing API test suite is **E2E/system only** (remote HTTP + real DB). +- The other boundaries are planned but not populated yet by the current API tests. + +### Boundaries (API testing only) +1. **Utils/helpers** (pure unit) +2. **Core services** (unit; mock/fake ports) +3. **DAOs** (unit; mock AsyncSession) +4. **Routers** (unit; in-process ASGI with mocked services) +5. **E2E/system** (real DB + real API wiring) + +--- + +## Dimensions (E2E only) + +Dimensions apply **only** to E2E tests, and do **not** apply to unit-layer tests. + +### API E2E dimensions (pytest runner) + +| Dimension | Values | Notes | +|---|---|---| +| license | oss, ee | | +| role | owner, admin, editor, viewer | | +| plan | hobby, pro, business, enterprise | | +| path | happy, grumpy | `--happy` / `--grumpy` | +| case | typical, edge | `--typical` / `--edge` | +| lens | functional, performance, security | `--functional` / `--performance` / `--security` | +| speed | fast, slow | `--fast` / `--slow` | +| coverage | smoke, full | `full` = no coverage filter | + +Required environment variables for API E2E: +- `AGENTA_API_URL` +- `AGENTA_AUTH_KEY` + +Notes: +- `--coverage full` means **no coverage filter** is applied. +- `scope` is intentionally excluded for now. + +### Web E2E dimensions (Playwright) + +Source: `/Users/junaway/Agenta/github/agenta/web/tests/README.md` and `playwright/config/testTags.ts` + +| Dimension | Values | Notes | +|---|---|---| +| coverage | smoke, sanity, light, full | | +| path | happy, grumpy | | +| case | typical, edge | | +| lens | functional, performance, security | | +| speed | fast, slow | | +| license | oss, ee | Depends on preset | +| permission | owner, editor, viewer | | +| entitlement | hobby, pro | | +| feature-scope | ee | Feature availability | +| env/preset | local, staging, beta, prod, demo, oss | | + +Required environment variables for Web E2E: +- `TESTMAIL_API_KEY` +- `TESTMAIL_NAMESPACE` +- `AGENTA_OSS_OWNER_PASSWORD` (OSS runs only) +- `AGENTA_OSS_OWNER_EMAIL` (optional for OSS) +- `AGENTA_API_URL` (used for teardown and API flows) + +Notes: +- `scope` exists in the web runner but is intentionally excluded here. + +--- + +## The requested testing scope (what to test and what not to test) + +### You want to test (unit level) +1. **Utils / helpers** +2. **Core** (application/domain services) — not routers, not DAOs +3. **Outbound adapters (DAOs)**, but via mocking the session/DB boundary (no running DB) +4. 
**Inbound adapters (routers/APIs)** via mocking services and running handlers in-process + +### You do *not* want in unit tests +- A running **Postgres** +- A running **web server process** +- Any "fake Postgres server" or DB emulator + +### You want to test (end-to-end level) +- A **real system**: API + DB running (or app in-process + real DB), as one E2E suite + +--- + +## Why SQLite in-memory is not useful for Core tests + +You clarified that you want to test **Core**, not routers/DAOs. + +For Core tests: +- Core should depend on **ports** (interfaces) and should not know about SQL, sessions, engines, or HTTP. +- Using **SQLite in-memory** introduces an adapter dependency into what should be a pure unit test. +- If you are mocking the DAO anyway, SQLite is redundant. +- If you are not mocking the DAO, you are no longer testing "Core only"; you're testing a persistence adapter too. + +**Conclusion:** For Core unit tests, prefer **mock/fake implementations of the DAO port** (pure Python), not SQLite. + +--- + +## The final test pyramid you requested + +You requested a strategy with: + +1. **Unit tests: utils/helpers** +2. **Unit tests: Core services** (mock DAO port) +3. **Unit tests: DAOs** (mock SQLAlchemy AsyncSession — not engine) +4. **Unit tests: routers** (mock Core services; in-process ASGI) +5. **E2E tests: one suite** (real DB + real API wiring) + +No separate "integration tests" layer. + +--- + +# Unit tests + +## 1) Utils / helpers tests (pure unit) + +### What belongs here +- parsing/formatting utilities (IDs, dates, pagination tokens) +- validators and normalizers +- deterministic encoding/serialization (flatten/unflatten, safe encoders) +- hashing helpers +- small algorithms used by Core or adapters +- error mapping utilities *as long as they are not bound to SQLAlchemy/HTTP specifics* + +### How to test +- direct function calls +- table-driven tests (`pytest.mark.parametrize`) +- (optional) property-based tests for parsers/encoders + +### Tradeoffs +**Pros** +- fastest tests +- high signal: pure determinism, easy to cover edge cases +- no mocking needed + +**Cons** +- avoid testing trivial wrappers around libraries unless you're encoding business rules +- don't create brittle tests that lock in implementation details + +--- + +## 2) Core unit tests (mock the DAO port) + +### What you test +- invariants and state transitions +- orchestration across ports (repo/DAO, clock, id generator, event bus, external clients) +- domain-level error mapping (e.g., `AlreadyExists`, `NotFound`) +- idempotency logic (in-memory fake makes this easy) +- emitted domain events / commands (if you have them) + +### What you inject +- **Fake** or **Mock** for the DAO interface (port) + +**Preference: fakes over mocks** +- Use **fakes** when Core behavior depends on persistence state (e.g., create then fetch; idempotency; sequences). +- Use **mocks** when you only care about an interaction (called once, called with specific args). + +### Tradeoffs +**Pros** +- isolates Core perfectly +- extremely fast and stable +- focuses on business logic and contracts + +**Cons** +- if Core leaks adapter concerns (SQLAlchemy models/sessions), test isolation gets hard +- correctness of SQL queries is not validated here (by design) + +--- + +## 3) DAO unit tests (mock SQLAlchemy AsyncSession) + +You confirmed you use **asyncpg with SQLAlchemy**. 
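To make the target concrete before discussing the seam, here is a minimal sketch of such a DAO unit test. All names (`WorkflowDBE`, `WorkflowsDAO`, `get_by_slug`) are illustrative stand-ins, not the actual Agenta DAOs; the point is that the `AsyncSession` is replaced by an `AsyncMock` and the assertions target the statement and its bound parameters:

```python
# Minimal sketch (illustrative names): a DAO unit test that mocks AsyncSession
# instead of spinning up Postgres.
from unittest.mock import AsyncMock, MagicMock

import pytest
from sqlalchemy import String, select
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class WorkflowDBE(Base):  # stand-in table for the example
    __tablename__ = "workflows"
    id: Mapped[str] = mapped_column(String, primary_key=True)
    slug: Mapped[str] = mapped_column(String)


class WorkflowsDAO:  # stand-in DAO: builds a statement and executes it
    def __init__(self, session):
        self.session = session

    async def get_by_slug(self, slug: str):
        stmt = select(WorkflowDBE).where(WorkflowDBE.slug == slug)
        result = await self.session.execute(stmt)
        return result.scalar_one_or_none()


@pytest.mark.asyncio
async def test_get_by_slug_builds_expected_statement():
    # ARRANGE: AsyncSession replaced by an AsyncMock; the result object is a plain mock
    session = AsyncMock()
    session.execute.return_value = MagicMock(
        scalar_one_or_none=MagicMock(return_value=None)
    )
    dao = WorkflowsDAO(session)

    # ACT
    await dao.get_by_slug("test-workflow")

    # ASSERT: exactly one execute() call, carrying the expected bound parameter
    session.execute.assert_awaited_once()
    stmt = session.execute.await_args.args[0]
    compiled = stmt.compile()  # optionally compile with postgresql.dialect()
    assert compiled.params == {"slug_1": "test-workflow"}
```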
+ +### The seam to mock +Even though you "create an engine and pass it to the DAO", for unit tests the clean boundary is: + +- mock **`AsyncSession`** (or a session factory / `async_sessionmaker`), not the engine + +Why: +- DAOs typically call `session.execute(...)`, `session.commit()`, etc. +- Engine mocking pushes you into internal plumbing (connections/pooling/begin blocks), which is brittle +- Mocking sessions gives you "did the DAO send the right request?" without running a DB + +### What DAO unit tests should cover +- **statement construction** (SQLAlchemy statement shape) +- **bound parameters** (values, required params present) +- call sequence (execute/commit/rollback if DAO controls it) +- row-to-domain mapping +- exception mapping: + - SQLAlchemy/driver exceptions → your domain persistence errors + +### Two common assertion styles +1) **Fake session records calls** + - assert that `execute()` was called with a statement and params +2) **Compile statement using Postgres dialect** + - compile SQLAlchemy statement with `postgresql.dialect()` + - assert on **SQL fragments** + **compiled params** + - avoid exact-string comparisons to reduce brittleness + +### Tradeoffs (important) +**Pros** +- fast and deterministic +- verifies your adapter's request construction and mapping logic +- enforces the adapter-to-port contract at unit level + +**Cons** +- cannot validate real Postgres semantics (JSONB operators, ON CONFLICT behavior, type casting, locks, query planner) +- may go "green" while Postgres rejects the query in reality +- therefore your E2E suite becomes the only semantic safety net for DB behavior + +*(This is the explicit tradeoff you accept when skipping adapter integration tests.)* + +--- + +## 4) Router unit tests (mock services, in-process ASGI) + +You said "I don't need a running backend." +So router tests should be in-process: + +- build FastAPI app +- mount routes +- dependency-inject (override dependencies) with mocked services +- use `httpx.AsyncClient` or FastAPI TestClient to call endpoints + +### What routers tests cover +- request parsing and validation (422) +- status codes and response shapes +- error mapping at HTTP boundary +- auth boundary behaviors (if implemented in router/middleware) +- pagination inputs/outputs +- content negotiation (JSON, files, etc.) + +### Tradeoffs +**Pros** +- no server process +- fast feedback +- protects API contract and translations + +**Cons** +- does not validate full wiring with DAOs (by design at unit level) +- cannot validate actual network stack behavior (TLS, reverse proxy headers, etc.) + +--- + +# E2E tests (one suite) + +Since you are skipping integration tests, E2E is your only "real dependency" validation. + +## What E2E must validate (because nothing else will) +1. Wiring across layers: routers → core → dao → db +2. Postgres semantics that mocks can't catch: + - constraints (unique/fk) + - transactionality and rollbacks + - Postgres-specific features you use (JSONB, FTS, ON CONFLICT, RETURNING, etc.) 
+ - driver error shapes / mapping correctness + +## Keep E2E small but targeted +A minimal E2E suite that pays for itself: +- **happy path CRUD** for 1–2 key entities +- **constraint case** (unique violation) to validate error mapping +- **transaction case** (force mid-operation failure; ensure rollback) +- **idempotency/concurrency-ish case** if relevant (even a simple repeat request) + +## How to run E2E +- spin a real Postgres (docker-compose or testcontainers) +- run migrations +- run the FastAPI app (either: + - in-process ASGI client with the real DI wiring, OR + - as a process and call it over HTTP) + +--- + +# Recommended project layout (matches the above) + +``` +tests/ + unit/ + utils/ + test_*.py + core/ + test_*.py + adapters/ + db/ + test_*.py + http/ + test_*.py + e2e/ + test_*.py +tests/_support/ + fakes.py + builders.py + assertions.py +``` + +Where `tests/_support` contains: +- InMemory/Fake repositories (ports) +- Fake session/result objects for DAO unit tests +- common builders for domain objects/DTOs +- minimal assertion helpers + +--- + +# Practical mocking guidance per layer + +## Core +- Mock/fake **ports** (DAO interface, clock, id generator) +- Avoid coupling tests to SQLAlchemy types or HTTP DTOs + +## DAO +- Mock **AsyncSession** (and result objects) +- Optionally compile statements with **Postgres dialect** and assert fragments/params +- Test exception mapping with `sqlalchemy.exc.IntegrityError` and/or asyncpg error types if you map them + +## Routers +- Mock Core services +- Override dependencies in FastAPI +- Assert status codes and response schemas + +## E2E +- Real DI + real DB + migrations +- Small suite, high-value scenarios + +--- + +# Summary of the key tradeoffs you accepted + +By choosing **unit tests only** for Core/DAO/router/utils and **one E2E suite**, you gain: +- simplicity +- speed +- strong boundary testing via mocks + +But you accept: +- fewer early signals for Postgres-specific issues +- higher reliance on E2E to catch SQL/transaction/type/constraint semantics +- potential "green unit tests, red E2E" when SQL is wrong or dialect-specific + +Given that constraint, the best mitigation is: +- keep DAO unit assertions focused on statement structure + params (not exact SQL) +- make the E2E suite intentionally include at least 1–2 tests that exercise the Postgres features you actually rely on diff --git a/docs/designs/testing/testing.interface.api.specs.md b/docs/designs/testing/testing.interface.api.specs.md new file mode 100644 index 0000000000..f881799b31 --- /dev/null +++ b/docs/designs/testing/testing.interface.api.specs.md @@ -0,0 +1,172 @@ +# API Testing — Interface Specification + +The API interface is the FastAPI HTTP layer consumed by the SDK, Web frontend, and third-party integrations. This document describes the current test state, target state, and conventions specific to the API. + +For architectural layer definitions, see [testing.boundaries.specs.md](testing.boundaries.specs.md). +For dimension/marker taxonomy, see [testing.dimensions.specs.md](testing.dimensions.specs.md). +For folder layout, see [testing.structure.specs.md](testing.structure.specs.md). +For fixtures and utilities, see [testing.fixtures.specs.md](testing.fixtures.specs.md). + +--- + +## Current state + +### E2E test suite (`api/oss/tests/pytest/`) + +The existing test suite is E2E/system-level: tests make HTTP requests to a running API backed by a real database. 
+ +**Test domains covered (155 tests):** + +| Domain | Test files | Scope | +|--------|-----------|-------| +| Workflows | `test_workflows_basics.py`, `test_workflows_queries.py`, `test_workflows_retrieve.py`, `test_workflow_variants_basics.py`, `test_workflow_variants_queries.py`, `test_workflow_revisions_basics.py`, `test_workflow_revisions_queries.py`, `test_workflow_lineage.py` | CRUD, variants, revisions, lineage, retrieve | +| Evaluations | `test_evaluation_runs_basics.py`, `test_evaluation_runs_queries.py`, `test_evaluation_scenarios_basics.py`, `test_evaluation_scenarios_queries.py`, `test_evaluation_steps_basics.py`, `test_evaluation_steps_queries.py`, `test_evaluation_metrics_basics.py`, `test_evaluation_metrics_queries.py` | Runs, scenarios, steps, metrics | +| Testsets | `test_testsets_basics.py`, `test_testsets_queries.py`, `test_testsets_files.py`, `test_testcases_basics.py` | Testsets, testcases, file uploads | +| Evaluators | `test_evaluators_basics.py`, `test_evaluators_queries.py` | CRUD, queries | +| Annotations | `test_annotations_basics.py`, `test_annotations_queries.py` | CRUD, queries | +| Tracing | `test_traces_basics.py`, `test_spans_basics.py`, `test_spans_queries.py` | Traces, spans | +| Healthchecks | `test_healthchecks.py` | Connectivity | + +### EE test suite (`api/ee/tests/pytest/`) + +- `test_billing_period.py` — Multivariate tests for `compute_billing_period()` (12 months x 7 days x various anchors, including leap year edge cases). + +### Legacy tests (`api/oss/tests/legacy/`) + +54 Python test files. Not operational — excluded from `api/pytest.ini` test paths. Kept for reference. + +### Manual tests (`api/ee/tests/manual/`) + +`.http` files for manual testing of billing and auth flows. Not automated. + +### Configuration + +- **Config file:** `api/pytest.ini` +- **Test paths:** `oss/tests/pytest`, `ee/tests/pytest` +- **Async mode:** `auto` (via `pytest-asyncio`) +- **Markers:** See [testing.dimensions.specs.md](testing.dimensions.specs.md) for the full marker list. + +### Fixtures + +See [testing.fixtures.specs.md](testing.fixtures.specs.md) for full details. Key fixtures: + +| Fixture | Scope | Purpose | +|---------|-------|---------| +| `ag_env` | session | Reads `AGENTA_API_URL` and `AGENTA_AUTH_KEY` from environment | +| `unauthed_api` | session | Pre-configured `requests.Session` for unauthenticated endpoints | +| `authed_api` | class | Pre-configured request function with `Authorization` header | +| `cls_account` | class | Creates a test account via `POST /admin/account` | +| `mod_account` | module | Module-scoped test account | +| `foo_account` | function | Function-scoped test account | + +--- + +## Target state + +Apply the full [test pyramid](testing.principles.specs.md) to the API: + +### Layer 1: Utils/helpers unit tests + +**Location:** `api/oss/tests/pytest/unit/utils/` + +**Targets:** +- Parsing/formatting utilities in `api/oss/src/apis/fastapi/shared/utils.py` +- Pagination helpers in `api/oss/src/dbs/postgres/shared/utils.py` +- Normalization helpers in domain-specific `utils.py` files +- Error mapping utilities + +**Pattern:** `pytest.mark.parametrize` with input/output pairs. + +### Layer 2: Core service unit tests + +**Location:** `api/oss/tests/pytest/unit/core/` + +**Targets:** +- Services in `api/oss/src/core/<domain>/service.py` +- Test with fake DAO port implementations (in-memory dicts) +- Verify invariants, orchestration, domain error mapping + +**Pattern:** Inject fakes for all ports. 
Use `pytest/_support/fakes.py` for shared fake implementations. + +### Layer 3: DAO unit tests + +**Location:** `api/oss/tests/pytest/unit/adapters/db/` + +**Targets:** +- DAOs in `api/oss/src/dbs/postgres/<domain>/dao.py` +- Mock `AsyncSession` +- Verify statement construction, bound parameters, row mapping, exception mapping + +**Pattern:** Two assertion styles per [testing.boundaries.specs.md](testing.boundaries.specs.md): fake session or Postgres dialect compilation. + +### Layer 4: Router unit tests + +**Location:** `api/oss/tests/pytest/unit/adapters/http/` + +**Targets:** +- Routers in `api/oss/src/apis/fastapi/<domain>/router.py` +- Override FastAPI dependencies with mocked Core services +- Test in-process via `httpx.AsyncClient` + +**Pattern:** Build minimal FastAPI app, mount route under test, override dependencies. + +### Layer 5: E2E tests (existing) + +The current E2E suite in `api/oss/tests/pytest/` moves to `api/oss/tests/pytest/e2e/` for consistency with the runner → type → domain hierarchy. See [testing.structure.specs.md](testing.structure.specs.md) for the full target layout. + +--- + +## Mocking guidance (API-specific) + +| Layer | Mock target | What to assert | +|-------|------------|----------------| +| Core | DAO interface (port) | Return values, side effects, domain errors | +| DAO | `AsyncSession` | Statement shape, bound params, call sequence, row mapping | +| Router | Core service | Status codes, response shapes, error mapping | +| E2E | Nothing | Full stack behavior | + +--- + +## Conventions + +### Test class naming + +Follow the established pattern: +- `TestXxxBasics` — CRUD operations (create, read, update, delete, list) +- `TestXxxQueries` — Filtering, pagination, search +- `TestXxxLineage` — Revision/variant lineage (for git-pattern resources) + +### Test method structure + +Use ARRANGE/ACT/ASSERT comment sections: +```python +def test_create_workflow(self, authed_api): + # ARRANGE + payload = {"slug": "test-workflow", "name": "Test Workflow"} + + # ACT + response = authed_api("POST", "/api/workflows", json=payload) + + # ASSERT + assert response.status_code == 200 + data = response.json() + assert data["slug"] == "test-workflow" +``` + +### Fixture scoping + +- `session` — Environment setup, shared across all tests +- `class` — Account/resource setup shared within a test class +- `module` — Account/resource setup shared across classes in a module +- `function` — Per-test isolation (use for tests that mutate state) + +--- + +## Environment + +| Variable | Required | Purpose | +|----------|----------|---------| +| `AGENTA_API_URL` | Yes | Base URL of the running API | +| `AGENTA_AUTH_KEY` | Yes | Admin key for creating test accounts | + +--- diff --git a/docs/designs/testing/testing.interface.sdk.specs.md b/docs/designs/testing/testing.interface.sdk.specs.md new file mode 100644 index 0000000000..ffd9c3f3de --- /dev/null +++ b/docs/designs/testing/testing.interface.sdk.specs.md @@ -0,0 +1,208 @@ +# SDK Testing — Interface Specification + +The SDK interface is the Python package (`agenta`) consumed by end users to interact with Agenta programmatically. This document describes the current test state, target state, and conventions specific to the SDK. + +For architectural layer definitions, see [testing.boundaries.specs.md](testing.boundaries.specs.md). +For dimension/marker taxonomy, see [testing.dimensions.specs.md](testing.dimensions.specs.md). +For folder layout, see [testing.structure.specs.md](testing.structure.specs.md). 
+For fixtures and utilities, see [testing.fixtures.specs.md](testing.fixtures.specs.md). + +--- + +## Current state + +### Unit tests (`sdk/tests/unit/`) + +**Coverage:** +- `test_tracing_decorators.py` — Comprehensive tests for SDK tracing decorators + - Sync functions, async functions, generators, async generators + - Mock-based: mocks `ag.tracer` and `ag.tracing` to isolate decorator logic + - Test classes: `TestExistingFunctionality`, `TestGeneratorTracing`, `TestAsyncGeneratorTracing` + +**Supporting docs (in-tree):** +- `sdk/tests/unit/README.md` — Quick start, running tests, adding new tests +- `sdk/tests/unit/TESTING_PATTERNS.md` — Testing approaches and patterns + +### Integration tests (`sdk/tests/integration/`) + +Tests exercise SDK manager methods against a running Agenta API. These are SDK-level E2E tests that validate the SDK's HTTP client layer, serialization, and API contract. + +**Domains covered:** +- `applications/` — `test_apps_shared_manager.py` (913+ lines): comprehensive sync/async CRUD, response serialization, error handling, concurrent operations +- `evaluations/` — `test_evaluations_flow.py`: evaluation flow tests +- `evaluators/` — Evaluator CRUD tests +- `prompts/` — Prompt management tests +- `testsets/` — Testset CRUD tests +- `tracing/` — `test_observability_traces.py`: trace integration tests +- `vault/` — Vault/secrets tests + +**Fixture infrastructure (`sdk/tests/integration/conftest.py`):** + +| Fixture | Scope | Purpose | +|---------|-------|---------| +| `api_credentials` | session | Reads `AGENTA_HOST` (default: `https://cloud.agenta.ai`) and `AGENTA_API_KEY`. Skips test if missing. | +| `agenta_init` | function | Initializes SDK with `ag.init()` and forces httpx client rebinding for async test compatibility | +| `test_app` | function | Creates app via `AppManager.create()`, yields `{app_id, app_slug}`, cleans up on teardown | +| `test_variant` | function | Creates variant via `SharedManager.add()`, yields `{variant_slug, variant_id, app_id}`, cleans up | +| `otlp_flat_span_factory` | session | Factory for `OTelFlatSpanInput` objects | +| `deterministic_testset_name` | session | Returns `"sdk-it-testset-v1"` to avoid test resource proliferation | +| `deterministic_evaluator_slug` | session | Returns `"sdk-it-evaluator-v1"` | + +**Credential management:** +- `_skip_integration_if_missing_credentials` (autouse) — Skips tests marked `@pytest.mark.integration` when `AGENTA_API_KEY` is not set +- `requires_credentials` — Skip decorator for non-marker-based conditional skipping + +### Smoke/healthcheck tests (`sdk/tests/pytest/`) + +- `healthchecks/test_healthchecks.py` — Basic API connectivity and auth validation +- Uses the same fixture/marker system as the API tests (`ag_env`, `authed_api`, `unauthed_api`, account fixtures) + +### Legacy tests (`sdk/tests/legacy/`) + +Multiple legacy test suites covering annotations, baggage, custom workflows, debugging, management, observability, redact, routing. Not operational. + +### Configuration + +- **Config file:** `sdk/pytest.ini` +- **Test paths:** `tests/pytest` +- **Async mode:** `auto` +- **Markers:** Identical to API markers (see [testing.dimensions.specs.md](testing.dimensions.specs.md)) +- **Dev dependencies:** `pytest ^9`, `pytest-asyncio ^1`, `pytest-xdist ^3` + +--- + +## Unit / E2E split + +The SDK follows the same universal structure as all interfaces: `utils/`, `unit/`, `e2e/`. The dividing line is whether a test needs the backend running. 
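The sections below detail each side. As a minimal sketch of that dividing line (the `fetch_app` helper and the `/api/apps/{id}` path are hypothetical; real SDK tests go through the managers and the Fern-generated client), a unit test stubs the HTTP layer with `httpx.MockTransport`, while the E2E variant talks to a real backend and is skipped when credentials are absent:

```python
import os

import httpx
import pytest


def fetch_app(client: httpx.Client, app_id: str) -> dict:
    # stand-in for SDK manager logic: request construction + response parsing
    response = client.get(f"/api/apps/{app_id}")
    response.raise_for_status()
    return response.json()


def test_fetch_app_unit():
    # unit: no backend -- MockTransport returns a canned response
    transport = httpx.MockTransport(
        lambda request: httpx.Response(200, json={"app_id": "abc", "app_name": "demo"})
    )
    client = httpx.Client(transport=transport, base_url="http://testserver")
    assert fetch_app(client, "abc")["app_name"] == "demo"


@pytest.mark.skipif(not os.getenv("AGENTA_API_KEY"), reason="requires backend credentials")
def test_fetch_app_e2e():
    # e2e: would build a real client from AGENTA_HOST / AGENTA_API_KEY and assert
    # against live responses; body omitted here because it depends on live resources
    ...
```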
+ +### E2E (requires backend) + +E2E tests validate the SDK against the real system. They exercise the HTTP client layer, serialization, and API contract end-to-end. + +**Domains:** + +| Domain | What it tests | Examples | +|--------|--------------|---------| +| **Observability** | OTLP trace sending, span capture, trace querying | Send traces via SDK, confirm they appear in the system | +| **Evaluations** | Evaluation SDK flows end-to-end | Run evaluations, write metrics, fetch results, confirm correctness | +| **Integrations** | Pull: fetching secrets, entities, configs. Push: webhooks, notifications, events | Vault secrets CRUD, entity fetching, event delivery | +| **Collaboration** | Messages, threads, annotations (future) | Thread creation, message posting | +| **Workflows** | Custom workflow deployment and invocation requiring platform access | Workflows that need secrets, tracing hooks, or evaluation hooks | +| **Healthchecks** | Connectivity and auth validation | Basic API reachability | + +### Unit (no backend) + +Unit tests run without the system. Anything that can be tested in isolation belongs here. + +**What goes in unit:** +- Workflow decorator behavior (`@ag.workflow`, `@ag.route`, `@ag.instrument`) — stateless, no authorization needed +- Route registration and parameter parsing +- Manager method logic (request construction, response parsing) — mock `httpx` transport or Fern client +- Configuration/initialization (`ag.init()`) — parameter combinations, env var handling, singleton behavior +- Error handling — SDK error mapping from HTTP status codes to SDK exceptions +- Retry/timeout logic — mocked transport returning errors +- In some cases, workflows can run in a subprocess without the full system + +**What to mock:** +- Mock `httpx` transport or the Fern-generated client (`AgentaApi`, `AsyncAgentaApi`), not the SDK's public API surface. +- For workflow decorators: mock `ag.tracer` and `ag.tracing` to isolate decorator logic. +- Test both sync and async code paths. + +--- + +## Target state + +### E2E + +Organize by domain: + +``` +sdk/tests/pytest/e2e/ + observability/ # OTLP, trace sending, span capture + evaluations/ # Evaluation flows, metrics + integrations/ # Secrets, entities, webhooks, events + collaboration/ # Messages, threads (future) + workflows/ # Custom workflow deployment + invocation + healthchecks/ # Connectivity +``` + +### Unit + +Expand beyond tracing decorators: + +``` +sdk/tests/pytest/unit/ + test_tracing_decorators.py # Existing: workflow decorators + test_workflow_decorators.py # Route creation, parameter parsing + test_managers.py # Manager method logic (mock HTTP) + test_init.py # Configuration/initialization + test_errors.py # Error handling +``` + +--- + +## Conventions + +### Test class naming + +Follow the established pattern in `test_tracing_decorators.py`: +- `TestExistingFunctionality` — Tests for known working behavior +- `TestGeneratorTracing` — Tests for specific feature area +- `TestAsyncGeneratorTracing` — Tests for async variant of feature + +### Mock setup + +```python +@pytest.fixture +def mock_tracer(mocker): + return mocker.patch("agenta.sdk.decorators.tracing.ag.tracer") +``` + +### Integration test naming + +- Use `sdk-it-` prefix for deterministic test resource names to avoid proliferation +- Examples: `sdk-it-testset-v1`, `sdk-it-evaluator-v1` + +### SDK reinitialization + +Integration tests must force-reinitialize the SDK per test function to avoid stale httpx client references across event loops. 
The `agenta_init` fixture handles this via `_force_reinit_sdk()`. + +--- + +## Environment + +| Variable | Required for | Default | Purpose | +|----------|-------------|---------|---------| +| `AGENTA_API_KEY` | Integration tests | None (test skips if missing) | API authentication | +| `AGENTA_HOST` | Integration tests | `https://cloud.agenta.ai` | API base URL | + +--- + +## Running tests + +```bash +# All SDK tests (unit + E2E, E2E skips if no credentials) +cd sdk && pytest tests/pytest/ -v + +# Unit tests only +cd sdk && pytest tests/pytest/unit/ -v + +# E2E tests only (requires credentials) +AGENTA_API_KEY=... AGENTA_HOST=... cd sdk && pytest tests/pytest/e2e/ -v + +# Specific E2E domain +AGENTA_API_KEY=... cd sdk && pytest tests/pytest/e2e/observability/ -v + +# Specific test class +cd sdk && pytest tests/pytest/unit/test_tracing_decorators.py::TestGeneratorTracing -v + +# With coverage +cd sdk && pytest tests/pytest/unit/ --cov=agenta.sdk --cov-report=html +``` + +--- + +## References + +- `sdk/tests/unit/README.md` — Quick start for SDK unit tests +- `sdk/tests/unit/TESTING_PATTERNS.md` — Detailed testing patterns and module-specific guidance diff --git a/docs/designs/testing/testing.interface.web.specs.md b/docs/designs/testing/testing.interface.web.specs.md new file mode 100644 index 0000000000..989bfa55df --- /dev/null +++ b/docs/designs/testing/testing.interface.web.specs.md @@ -0,0 +1,185 @@ +# Web Testing — Interface Specification + +The Web interface is the Next.js frontend consumed by users via browser. This document describes the current test state, target state, and conventions specific to the Web. + +For architectural layer definitions, see [testing.boundaries.specs.md](testing.boundaries.specs.md). +For dimension/marker taxonomy, see [testing.dimensions.specs.md](testing.dimensions.specs.md). +For folder layout, see [testing.structure.specs.md](testing.structure.specs.md). +For fixtures and utilities, see [testing.fixtures.specs.md](testing.fixtures.specs.md). + +--- + +## Current state + +### E2E tests (Playwright) + +**Runner:** `web/tests/` — Playwright v1.57.0 + +**Configuration (`web/tests/playwright.config.ts`):** +- Test directory: dynamically set via `PROJECT_DIRECTORY` env var +- Single worker, no parallelization +- Retries: 2 in CI, configurable locally +- Timeouts: 60s per test, 60s for expectations +- Artifacts: trace on first retry, screenshots only on failure, video retained on failure +- Storage state: `state.json` for session persistence +- Reporter: HTML +- Browser: Desktop Chrome + +**Test organization (feature-numbered):** + +| Number | Area | OSS | EE | +|--------|------|-----|-----| +| 1 | Settings (API keys, model hub) | Yes | Yes | +| 2 | App creation | Yes | Yes | +| 3 | Playground (run variant) | Yes | Yes | +| 4 | Prompt registry | Yes | Yes | +| 5 | Testset management | Yes | Yes | +| 6 | Auto-evaluation | No | Yes | +| 7 | Observability | Yes | Yes | +| 8 | Deployment | Yes | Yes | +| 9 | Human annotation | No | Yes | + +**Global setup/teardown:** +- Located in `web/tests/playwright/global-setup` and `global-teardown` +- Requires testmail integration for email-based authentication + +**Tag system (`web/tests/playwright/config/testTags.ts`):** +See [testing.dimensions.specs.md](testing.dimensions.specs.md) for the full taxonomy. Tags use the `@dimension:value` syntax (e.g., `@coverage:smoke`, `@path:happy`). 
+ +### Data layer integration tests + +**Location:** `web/oss/tests/datalayer/` + +TypeScript-based tests that exercise Jotai atoms + TanStack Query against a live API: +- `test-apps.ts` — Application state management +- `test-observability.ts` — Observability state management + +Executed via `tsx` for TypeScript support. + +### Component unit tests + +**Location:** Colocated `__tests__/` directories near source code. + +**Example:** `web/oss/src/components/Playground/state/atoms/__tests__/core.test.ts` +- Tests Jotai atoms using `createStore()` for isolated store instances +- Tests `selectedVariantsAtom`, `viewTypeAtom`, mutation atoms +- No DOM rendering, no API calls — pure state logic testing + +### Scripts (npm) + +**From `web/tests/package.json`:** +- `pnpm test:e2e` — Run all E2E tests +- `pnpm test:e2e:ui` — Run with Playwright UI mode +- `pnpm test:e2e:debug` — Debug mode + +**From `web/package.json`:** +- `pnpm test:datalayer` — All data layer tests +- `pnpm test:apps` — App tests +- `pnpm test:observability` — Observability tests +- Plus: `test:revision-centric`, `test:environments`, `test:deployments`, `test:orgs`, `test:profile`, `test:workspace`, `test:project`, `test:newPlayground` + +--- + +## Boundaries applied to Web + +The Web has a different architecture than the API. The relevant boundaries are: + +| Boundary | Web equivalent | Status | +|----------|---------------|--------| +| Utils/helpers (pure unit) | Pure utility functions, formatters, validators | Minimal | +| Core/business logic | Jotai atoms, derived selectors, mutation atoms | Partially exists (Playground atoms) | +| Adapter unit | N/A (browser is the adapter) | N/A | +| E2E/system | Playwright browser tests + data layer integration tests | Exists | + +**What to test at the component unit level:** +- Jotai atoms with `createStore()` — test state transitions in isolation +- Derived atoms (selectors) — test computation logic +- Mutation atoms (write-only atoms) — test side effects and state updates +- Pure utility functions — formatters, validators, parsers + +**What NOT to test at the component unit level:** +- DOM rendering or component markup (use E2E for this) +- API calls (use data layer integration tests for this) +- Browser-specific behavior (use Playwright for this) + +--- + +## E2E test types + +Playwright E2E tests fall into two categories: + +1. **UI tests** — Full browser interaction: clicking, typing, navigating, asserting on rendered pages. These validate user-facing flows end-to-end. +2. **Internal API tests** — Playwright-driven tests that exercise the frontend's data fetching and API integration without necessarily asserting on UI rendering. Useful for validating data layer behavior in a real browser context. + +Both types use the same Playwright runner, fixtures, and tag system. + +--- + +## Target state + +### E2E (Playwright) + +The existing feature-numbered suites continue. Both UI and internal API test types are organized in the same numbered structure. + +### Unit tests + +**Current limitation:** React components in this codebase do not use dependency injection. Without DI, it is not practical to unit-test components in isolation (mocking props/context becomes fragile and couples tests to implementation). + +**Phase 1 (now):** Focus on what can be tested without DI: +1. **Utils** — Pure utility functions in `lib/helpers/`, formatters, validators. No DI needed. +2. **Atom/store tests** — Jotai atoms with `createStore()`. 
Each major feature (playground, evaluations, observability, testsets) should have `__tests__/` directories. +3. **Molecule/bridge pattern tests** — Test the molecule and bridge patterns from `@agenta/entities` using their imperative APIs (`molecule.get.*`, `molecule.set.*`). +4. **Package utility tests** — Test utilities exported from `@agenta/shared/utils`, `@agenta/ui`, and other workspace packages. + +**Phase 2 (when DI is available):** Once components adopt dependency injection (via providers, context, or atom-based injection): +- Component-level unit tests with mocked dependencies +- Test boundary layers analogous to API (state management, data fetching, rendering) + +--- + +## E2E guide references + +The following in-tree guides provide detailed procedural documentation for writing and maintaining Playwright E2E tests. This spec does not duplicate their content. + +| Guide | Location | What it covers | +|-------|----------|---------------| +| E2E Test Generation | `web/tests/guides/E2E_TEST_GENERATION_GUIDE.md` | Converting Playwright codegen output to production tests | +| E2E Test Organization | `web/tests/guides/E2E_TEST_ORGANIZATION_GUIDE.md` | Folder structure, naming, OSS/EE sharing | +| Utilities and Fixtures | `web/tests/guides/UTILITIES_AND_FIXTURES_GUIDE.md` | apiHelpers, uiHelpers, selector patterns | +| Recording Guide | `web/tests/guides/RECORDING_GUIDE.md` | Using Playwright codegen for recording | + +--- + +## Conventions + +### File naming +- `*.spec.ts` — Playwright E2E tests +- `*.test.ts` — Component unit tests +- `__tests__/` — Colocated test directories next to source + +### Fixture imports +E2E tests use a layered fixture system: +- `base.fixture` — API helpers, UI helpers, LLM key settings +- `user.fixture` — Authentication flows, email/password account creation +- `session.fixture` — Browser session management + +### Tag application +Every E2E test should include at minimum `@coverage:` and `@path:` tags: +```typescript +test("create app @coverage:smoke @path:happy", async ({ page }) => { + // ... +}) +``` + +--- + +## Environment + +| Variable | Required for | Purpose | +|----------|-------------|---------| +| `TESTMAIL_API_KEY` | E2E tests | Email-based auth flow testing | +| `TESTMAIL_NAMESPACE` | E2E tests | Testmail namespace | +| `AGENTA_OSS_OWNER_PASSWORD` | E2E tests (OSS only) | OSS owner account password | +| `AGENTA_OSS_OWNER_EMAIL` | E2E tests (OSS, optional) | OSS owner email | +| `AGENTA_API_URL` | E2E teardown, API flows | API base URL | +| `NEXT_PUBLIC_AGENTA_API_URL` | Data layer tests | API URL for frontend | diff --git a/docs/designs/testing/testing.interfaces.specs.md b/docs/designs/testing/testing.interfaces.specs.md new file mode 100644 index 0000000000..435a9c7ec5 --- /dev/null +++ b/docs/designs/testing/testing.interfaces.specs.md @@ -0,0 +1,78 @@ +# Testing Interfaces + +An interface is a system surface that external consumers interact with. Each interface has its own test infrastructure, execution environment, and applicable subset of [boundaries](testing.boundaries.specs.md). + +This document provides a high-level overview. For detailed per-interface specifications, see the dedicated documents linked below. 
+ +--- + +## Interfaces + +| Interface | Description | Runner | Dedicated Spec | +|-----------|-------------|--------|----------------| +| **API** | FastAPI HTTP endpoints consumed by the SDK, Web frontend, and third-party integrations | Pytest | [testing.interface.api.specs.md](testing.interface.api.specs.md) | +| **SDK** | Python SDK consumed by end users to interact with Agenta programmatically | Pytest | [testing.interface.sdk.specs.md](testing.interface.sdk.specs.md) | +| **Web** | Next.js frontend consumed by users via browser | Playwright + Jest/Vitest | [testing.interface.web.specs.md](testing.interface.web.specs.md) | +| **Services** | Background workers, Celery tasks, and non-HTTP backend services | Pytest | Planned | +| **Docs** | Docusaurus documentation site (link checking, build validation) | Scripts | Planned | + +**Future interfaces** (not yet scoped): +- **MCP** — Model Context Protocol server for AI agent integration. +- **Agents** — Agent-facing APIs and workflows. + +--- + +## Interface x boundary matrix + +This matrix shows which [boundaries](testing.boundaries.specs.md) apply to each interface, and the current state of test coverage. + +| Boundary | API | SDK | Web | Services | Docs | +|----------|-----|-----|-----|----------|------| +| **Utils/helpers** (pure unit) | Planned | Exists (tracing decorators) | Exists (atom tests) | Planned | N/A | +| **Core services** (unit, mock ports) | Planned | Planned | N/A | Planned | N/A | +| **Adapters — outbound/DB** (unit, mock session) | Planned | N/A | N/A | Planned | N/A | +| **Adapters — inbound/HTTP** (unit, in-process) | Planned | N/A | N/A | N/A | N/A | +| **E2E/system** (real dependencies) | Exists (155 tests) | Exists (integration suite) | Exists (Playwright suites) | Planned | Planned (scripts) | + +**Key observations:** +- All three established interfaces (API, SDK, Web) have E2E coverage. +- Unit-level coverage exists only partially (SDK tracing decorators, Web atom tests). +- API unit tests across all four boundary layers are the primary gap to fill. +- Services and Docs interfaces are not yet established. + +--- + +## Interface interaction model + +``` +Users ──────► Web ──────► API ──► Database + │ +Developers ──► SDK ──────► API ──► Database + │ +Workers ─────► Services ──► API ──► Database + │ +Agents ──────► MCP ─────► API ──► Database (future) + +Docs site ──► Build + deploy pipeline (static) +``` + +The API is the central interface. SDK and Web tests that run against a live API implicitly exercise the API stack. This means: +- API E2E tests validate the API in isolation. +- SDK integration tests validate the SDK + API together. +- Web E2E tests validate the Web + API together. + +When an SDK or Web E2E test fails, the root cause may be in the API layer. Cross-reference API E2E results when debugging. + +--- + +## Adding a new interface + +When a new interface is added (e.g., MCP): + +1. Create `testing.interface.<name>.specs.md` following the structure of existing interface specs. +2. Add a row to the interface matrix above. +3. Identify which [boundaries](testing.boundaries.specs.md) apply. +4. Add relevant [dimensions](testing.dimensions.specs.md) if the new interface introduces new filtering needs. +5. Update [testing.structure.specs.md](testing.structure.specs.md) with the folder layout. +6. Update [testing.running.specs.md](testing.running.specs.md) with execution commands. +7. Update [README.md](README.md) with the new document link. 
diff --git a/docs/designs/testing/testing.principles.specs.md b/docs/designs/testing/testing.principles.specs.md new file mode 100644 index 0000000000..225c5c4cfe --- /dev/null +++ b/docs/designs/testing/testing.principles.specs.md @@ -0,0 +1,91 @@ +# Testing Principles + +## Architecture context + +The Agenta API follows a ports-and-adapters (hexagonal) architecture with inversion of control: + +1. **Outbound adapters (DB)**: SQLAlchemy async engine (asyncpg driver) + DAO implementations per entity. +2. **Core layer**: Defines DAO interfaces (ports). Core services receive port implementations. +3. **Inbound adapters (HTTP)**: FastAPI routers receive Core services. +4. **Composition root**: Wires everything together in `api/entrypoints/`. + +Dependencies flow inward: + +- Routers depend on Core services. +- Core depends on ports (interfaces). +- Adapters implement ports and depend on infrastructure (SQLAlchemy session/engine). +- The composition root wires concrete implementations. + +This architecture applies most directly to the API. The principles of boundary isolation, mocking at seams, and E2E for real-dependency validation are universal across all components. + +## Test pyramid + +The target test pyramid has four layers, from fastest/most-isolated to slowest/most-integrated: + +1. **Utils/helpers** (pure unit) — Parsing, formatting, validators, normalizers. No dependencies, no mocking needed. Direct function calls, table-driven tests. +2. **Core/business logic** (unit, mock ports) — Domain services tested with fake/mock implementations of their ports. Tests invariants, orchestration, domain error mapping. +3. **Adapter unit** (unit, mock infrastructure) — Outbound adapters (DAO -> mock session) and inbound adapters (router -> mock services). Tests the adapter's own logic in isolation. +4. **E2E/system** (real dependencies) — Full stack with real DB, real wiring. Validates cross-layer integration, infrastructure-specific semantics. + +No separate "integration test" layer exists for the API. The gap between unit and E2E is intentional. + +## Boundaries vs dimensions vs interfaces + +These are three orthogonal axes of the testing strategy: + +- **Boundaries** describe *where* in the architecture a test lives (which layer it exercises). See [testing.boundaries.specs.md](testing.boundaries.specs.md). +- **Dimensions** describe *how* tests are filtered or categorized (markers, tags). See [testing.dimensions.specs.md](testing.dimensions.specs.md). +- **Interfaces** describe *what system surface* is being tested (API, SDK, Web). See [testing.interfaces.specs.md](testing.interfaces.specs.md). + +A single test can be described along all three axes: it tests at the E2E boundary, is tagged as `coverage_smoke` and `path_happy`, and exercises the API interface. + +## Key strategic decisions + +1. **Unit tests use mocks/fakes, not running infrastructure.** No running Postgres, no running web servers, no DB emulators at the unit level. +2. **One E2E suite per component.** Each interface (API, SDK, Web) has one E2E test suite that runs against real dependencies. +3. **No separate integration test layer for the API.** The API strategy explicitly drops DAO-to-real-Postgres component tests. E2E is the only "real dependency" validation. +4. **Fakes preferred over mocks.** When Core behavior depends on persistence state (create-then-fetch, idempotency, sequences), in-memory fake implementations of ports are preferred over mock objects. 
Mocks are reserved for interaction-only assertions (called once, called with specific args). + +## Tradeoff summary + +**Gains:** + +- Simplicity — fewer test categories to maintain. +- Speed — unit tests are fast, no infrastructure spin-up. +- Strong boundary testing — each layer is tested against its contract via mocks/fakes. + +**Costs:** + +- Fewer early signals for Postgres-specific issues (constraints, JSONB operators, ON CONFLICT behavior, type casting, locks). +- Higher reliance on E2E to catch SQL/transaction/type/constraint semantics. +- Potential "green unit tests, red E2E" when SQL is wrong or dialect-specific. + +**Mitigation:** + +- DAO unit assertions should focus on statement structure and bound parameters, not exact SQL strings. +- The E2E suite should intentionally include tests that exercise Postgres-specific features the application relies on. + +## Mocking philosophy + +**Decision tree:** + +``` +Does the test need to verify state-dependent behavior? + (create -> fetch, idempotency, sequences) +|-- YES -> Use a FAKE (in-memory implementation of the port) +| - Stores state in a dict/list +| - Supports create/read/update/delete +| - Returns realistic domain objects ++-- NO -> Does the test verify an interaction? + (called once, called with specific args, called in order) + |-- YES -> Use a MOCK (unittest.mock or pytest-mock) + +-- NO -> Direct function call (no test double needed) +``` + +**General rules:** + +- Mock/fake at the boundary, not deep inside the implementation. +- Core tests mock ports (DAO interfaces, clock, id generators). Core tests never couple to SQLAlchemy types or HTTP DTOs. +- DAO tests mock AsyncSession. Statements may optionally be compiled with the Postgres dialect for assertion. +- Router tests mock Core services. FastAPI dependency overrides are used to inject test doubles. +- E2E tests use real DI wiring. No mocking. diff --git a/docs/designs/testing/testing.running.specs.md b/docs/designs/testing/testing.running.specs.md new file mode 100644 index 0000000000..dccd9d1cb6 --- /dev/null +++ b/docs/designs/testing/testing.running.specs.md @@ -0,0 +1,216 @@ +# Running Tests + +This document describes how to run tests across all interfaces and execution environments. It covers the three execution modes (local-against-local, local-against-cloud, CI-against-cloud), environment variables, commands per interface, dimension-based filtering, and the CI pipeline strategy. + +For dimension/marker definitions, see [testing.dimensions.specs.md](testing.dimensions.specs.md). +For per-interface details, see [testing.interface.api.specs.md](testing.interface.api.specs.md), [testing.interface.sdk.specs.md](testing.interface.sdk.specs.md), [testing.interface.web.specs.md](testing.interface.web.specs.md). + +--- + +## Execution environments + +Tests can run in three modes, distinguished by where the tests execute and what backend they target. + +### Local against local + +All services run locally (via docker-compose or manual processes). Tests execute on the developer's machine and hit `localhost`. + +**When to use:** Day-to-day development, debugging, writing new tests. + +**Setup:** +- Start the API and database locally (e.g., `docker-compose up`) +- Set environment variables to point to local services +- Run tests directly via pytest or pnpm + +### Local against cloud + +Tests execute on the developer's machine but hit a cloud or staging API. + +**When to use:** Validating SDK or Web behavior against a deployed environment without running the full stack locally. 
+ +**Setup:** +- Set `AGENTA_API_URL` / `AGENTA_HOST` to the cloud URL (e.g., `https://cloud.agenta.ai`) +- Provide cloud credentials (`AGENTA_API_KEY`, `AGENTA_AUTH_KEY`) +- Run tests directly via pytest or pnpm + +### CI against cloud + +Tests execute in GitHub Actions and target a cloud/staging environment. + +**When to use:** Automated quality gates on PRs and merges. + +**Setup:** Configured via GitHub Actions workflows with secrets for credentials and service containers for infrastructure. + +--- + +## Environment variables + +Master table of all variables across all interfaces and modes: + +| Variable | Interface | Required | Default | Purpose | +|----------|-----------|----------|---------|---------| +| `AGENTA_API_URL` | API | Yes | -- | Base URL of the API under test | +| `AGENTA_AUTH_KEY` | API | Yes | -- | Admin key for creating test accounts | +| `AGENTA_HOST` | SDK | For integration | `https://cloud.agenta.ai` | API host for SDK tests | +| `AGENTA_API_KEY` | SDK | For integration | -- | API key for SDK authentication | +| `TESTMAIL_API_KEY` | Web E2E | Yes | -- | Testmail API key for email auth flows | +| `TESTMAIL_NAMESPACE` | Web E2E | Yes | -- | Testmail namespace | +| `AGENTA_OSS_OWNER_PASSWORD` | Web E2E (OSS) | Yes | -- | OSS owner account password | +| `AGENTA_OSS_OWNER_EMAIL` | Web E2E (OSS) | Optional | -- | OSS owner email | +| `NEXT_PUBLIC_AGENTA_API_URL` | Web data layer | Yes | -- | API URL for frontend tests | + +--- + +## Commands by interface + +### API + +```bash +# E2E tests (existing suite) +cd api && pytest oss/tests/pytest/ -v + +# E2E tests with dimension filter +cd api && pytest oss/tests/pytest/ -v -m "coverage_smoke and path_happy" + +# EE tests only +cd api && pytest ee/tests/pytest/ -v + +# Future: unit tests +cd api && pytest oss/tests/pytest/unit/ -v +``` + +### SDK + +**Current paths** (before migration): + +```bash +# Unit tests +cd sdk && pytest tests/unit/ -v + +# Integration tests (requires credentials) +AGENTA_API_KEY=<key> AGENTA_HOST=<url> cd sdk && pytest tests/integration/ -v + +# Healthcheck tests +cd sdk && pytest tests/pytest/ -v +``` + +**Target paths** (after migration to `tests/pytest/`): + +```bash +# All SDK tests (unit + E2E, E2E skips if no credentials) +cd sdk && pytest tests/pytest/ -v + +# Unit tests only +cd sdk && pytest tests/pytest/unit/ -v + +# Unit tests with coverage +cd sdk && pytest tests/pytest/unit/ --cov=agenta.sdk --cov-report=html + +# E2E tests only (requires credentials) +AGENTA_API_KEY=<key> AGENTA_HOST=<url> cd sdk && pytest tests/pytest/e2e/ -v + +# Specific E2E domain +AGENTA_API_KEY=<key> cd sdk && pytest tests/pytest/e2e/observability/ -v + +# Specific test class +cd sdk && pytest tests/pytest/unit/test_tracing_decorators.py::TestGeneratorTracing -v +``` + +### Web + +```bash +# E2E tests (from web/tests/) +cd web/tests && pnpm test:e2e + +# E2E with UI mode +cd web/tests && pnpm test:e2e:ui + +# E2E debug mode +cd web/tests && pnpm test:e2e:debug + +# Data layer tests (from web/) +cd web && pnpm test:datalayer + +# Individual data layer tests +cd web && pnpm test:apps +cd web && pnpm test:observability +``` + +--- + +## Dimension-based filtering + +### Pytest (API/SDK) + +The `-m` flag filters by markers: + +```bash +# Smoke tests only +pytest -m coverage_smoke + +# Happy path smoke tests +pytest -m "coverage_smoke and path_happy" + +# Functional tests for owner role +pytest -m "lens_functional and role_owner" + +# Exclude slow tests +pytest -m "not speed_slow" +``` + +Note: `coverage_full` is not a 
filter -- it means "run all tests" (no `-m` flag). + +### Playwright (Web) + +Dimension-specific CLI flags filter tests: + +```bash +# Smoke tests +pnpm test:e2e -- -coverage smoke + +# Happy path smoke tests +pnpm test:e2e -- -coverage smoke -path happy + +# Specific scope +pnpm test:e2e -- -scope playground + +# Functional tests for owner permission +pnpm test:e2e -- -lens functional -permission owner +``` + +--- + +## CI pipeline + +### Current state + +Only linting checks are active in CI: + +| Workflow | File | What it checks | +|----------|------|---------------| +| Python formatting | `.github/workflows/02-check-python-formatting.yml` | `ruff format` on `api/` and `sdk/` | +| Python linting | `.github/workflows/03-check-python-linting.yml` | `ruff check` on `api/` and `sdk/` | +| Frontend linting | `.github/workflows/04-check-frontend-linting.yml` | ESLint and Prettier on `web/` | + +No test execution workflows are currently active. + +### Target state + +| Trigger | What runs | Infrastructure | Coverage filter | +|---------|-----------|---------------|----------------| +| Every PR | API unit tests | None (pure Python) | All | +| Every PR | SDK unit tests | None (pure Python) | All | +| Every PR | Web component unit tests | None (Node.js) | All | +| Merge to main | API E2E tests | Postgres (docker-compose) | `coverage_smoke` | +| Merge to main | SDK integration tests | Running API + Postgres | `coverage_smoke` | +| Merge to main | Web E2E tests | Running app + API + Postgres | `coverage_smoke` | +| Nightly | API E2E tests | Postgres (docker-compose) | Full (no filter) | +| Nightly | SDK integration tests | Running API + Postgres | Full (no filter) | +| Nightly | Web E2E tests | Running app + API + Postgres | Full (no filter) | + +### Infrastructure requirements + +- **Postgres:** Service container or docker-compose for API E2E and SDK integration tests. +- **API server:** Required for SDK integration and Web E2E (can run in-process or as container). +- **Web app:** Required for Web E2E (Next.js dev server or built app). +- **Credentials:** Stored as GitHub Actions secrets (`AGENTA_AUTH_KEY`, `AGENTA_API_KEY`, `TESTMAIL_API_KEY`, `TESTMAIL_NAMESPACE`). diff --git a/docs/designs/testing/testing.structure.specs.md b/docs/designs/testing/testing.structure.specs.md new file mode 100644 index 0000000000..1628bd1e09 --- /dev/null +++ b/docs/designs/testing/testing.structure.specs.md @@ -0,0 +1,371 @@ +# Testing Structure -- Folder Layout and File Types + +This document describes the physical organization of test files across the monorepo. It covers the organizing principle, test categories, standardized directory layouts, file naming, and handling of legacy and manual tests. + +For what to test at each architectural layer, see [testing.boundaries.specs.md](testing.boundaries.specs.md). +For the five system interfaces, see [testing.interfaces.specs.md](testing.interfaces.specs.md) and the per-interface specs ([API](testing.interface.api.specs.md), [SDK](testing.interface.sdk.specs.md), [Web](testing.interface.web.specs.md)). 
+ +--- + +## Organizing principle + +Test files are organized by **test runner first, then by test type, then by domain**: + +``` +<component>/tests/ + manual/ # Not automated, developer reference (no fixed substructure) + legacy/ # Old tests, not run, preserved for reference + <runner>/ # pytest/ or playwright/ + conftest.py # Runner-level config and shared fixtures (pytest only) + e2e/ # E2E tests organized by domain + unit/ # Unit tests organized by boundary layer + utils/ # Shared fixture modules +``` + +**Why runner at top level, not domain?** + +- CI pipelines invoke by runner (`pytest`, `playwright`), not by domain. A single `pytest` invocation sweeps all domains. +- Runner config files (`conftest.py`, `playwright.config.ts`) naturally scope to the runner directory. +- Putting runner inside domain (e.g., `annotations/{pytest/,manual/}`) would force N separate runner invocations and N separate configs. + +**License split (OSS/EE) stays at the component level.** Each component has `oss/tests/` and `ee/tests/` (except SDK which is OSS-only) because: +- It matches source code organization (`oss/src/` vs `ee/src/`). +- EE tests can depend on EE code. +- OSS distribution can exclude `ee/` entirely. + +Within each license directory, the runner/type/domain hierarchy applies identically. + +**Standardization:** All interfaces follow this structure. Empty folders include `.gitkeep` files to ensure they're tracked by git. + +--- + +## Folder semantics + +| Folder | Purpose | Testing mode | Execution | +|--------|---------|--------------|-----------| +| `manual/` | Freestyle tests and scripts in any format (`.http`, `.sh`, `.py`, `.ts`, `.curl`, etc.) | N/A | Not run automatically. Not in CI. No framework required. May be run manually by developers or agents. | +| `legacy/` | Archived historical tests | N/A | Not run. Preserved for reference during migration. | +| `pytest/` or `playwright/` | Framework-based automated tests | Follows tool's conventions | Run by pytest/playwright tool. Can be invoked by agents, humans, or CI. | +| `e2e/` | End-to-end tests | **Black box** | System running behind it. Tests only interact with public surfaces (API URL, Web URL) using credentials. Full system integration. | +| `unit/` | Unit tests | **White box** | System NOT running. Tests internal parts and layers using dependency injection and mocks. No external dependencies. | +| `utils/` | Utilities and library tests | **White box** | Tests tools, libraries, internal benchmarks, and helper functions the system uses but that aren't part of the system itself. Gray line with `unit/`. | + +### Test file conventions + +| Type | Pattern | Example | +|------|---------|---------| +| Python test file | `test_*.py` | `test_workflows_basics.py` | +| Python test class | `TestXxxBasics`, `TestXxxQueries` | `TestWorkflowsBasics` | +| Playwright E2E | `*.spec.ts` | `create.spec.ts` | +| Component unit (Web) | `*.test.ts` | `core.test.ts` | +| Manual HTTP | `*.http` | `billing.http` | +| Manual script | `*.sh`, `*.py`, `*.ts` | `smoke.http`, `test-apps.ts` | +| Python conftest | `conftest.py` | Always this name | + +--- + +## Standardized directory layout + +The following structure is now implemented and standardized across all interfaces. 
+ +### API + +``` +api/ + pytest.ini # Test config (testpaths: oss/tests/pytest, ee/tests/pytest) + oss/tests/ + manual/ # Manual tests (no fixed substructure) + annotations/crud.http + auth/admin.http + evaluations/*.http + testsets/*.http + tracing/*.http + workflows/*.http + legacy/ # Legacy tests (NOT run, ~60 files, preserved for reference) + conftest.py, ... + pytest/ + conftest.py # Root conftest (imports from utils/) + e2e/ # E2E tests organized by domain (155 tests) + workflows/ + test_workflows_basics.py + test_workflows_queries.py + test_workflows_retrieve.py + test_workflow_variants_basics.py + test_workflow_variants_queries.py + test_workflow_revisions_basics.py + test_workflow_revisions_queries.py + test_workflow_lineage.py + evaluations/ + test_evaluation_runs_basics.py + test_evaluation_runs_queries.py + test_evaluation_scenarios_basics.py + test_evaluation_scenarios_queries.py + test_evaluation_steps_basics.py + test_evaluation_steps_queries.py + test_evaluation_metrics_basics.py + test_evaluation_metrics_queries.py + testsets/ + test_testsets_basics.py + test_testsets_queries.py + test_testsets_files.py + test_testcases_basics.py + evaluators/ + test_evaluators_basics.py + test_evaluators_queries.py + annotations/ + test_annotations_basics.py + test_annotations_queries.py + tracing/ + test_traces_basics.py + test_spans_basics.py + test_spans_queries.py + healthchecks/ + test_healthchecks.py + unit/ # Unit tests (.gitkeep placeholder) + utils/ # Shared fixtures + api.py # authed_api, unauthed_api fixtures + accounts.py # cls_account, mod_account, foo_account fixtures + env.py # ag_env fixture (AGENTA_API_URL, AGENTA_AUTH_KEY) + constants.py # BASE_TIMEOUT = 10 + ee/tests/ + manual/ # Manual tests + auth/*.http # Auth flow tests (discovery, policy, etc.) + billing.http + evaluations/sdk/*.py + legacy/ # .gitkeep placeholder + pytest/ + e2e/ + test_billing_period.py # Billing period E2E test + unit/ # .gitkeep placeholder + utils/ # .gitkeep placeholder +``` + +### SDK + +SDK is OSS-only (no EE split), so tests live directly under `sdk/tests/`. + +``` +sdk/ + pytest.ini # Test config (testpaths: tests/pytest) + tests/ + manual/ # Manual tests + imports/*.py # Import and init tests + workflows/*.py # SDK workflow manual tests + tools/*.py # Tool invocation tests + legacy/ # Legacy tests (NOT run, preserved for reference) + annotations/, baggage/, custom_workflows/, debugging/, management/, ... 
+ pytest/ + conftest.py + e2e/ # SDK E2E tests (66 tests, against live API) + workflows/ + test_apps_shared_manager.py + test_legacy_applications_manager.py + evaluations/ + test_evaluations_flow.py + evaluators/ + test_evaluators_manager.py + integrations/ + test_prompt_template_storage.py + test_testsets_manager.py + test_vault_secrets.py + observability/ + test_observability_traces.py + healthchecks/ + test_healthchecks.py + unit/ # Unit tests (22 tests, no external deps) + conftest.py + test_tracing_decorators.py + utils/ # Shared fixtures + env.py # Environment variables + sdk.py # SDK client fixtures + accounts.py # Account management + constants.py # Test constants +``` + +### Web + +``` +web/ + tests/ # Shared Playwright infrastructure + package.json # E2E scripts (test:e2e, test:e2e:ui, test:e2e:debug) + playwright.config.ts # Playwright configuration (testDir points to e2e/) + playwright/ + config/ + testTags.ts # Tag definitions and syntax + types.d.ts # Tag type definitions + global-setup.ts # Auth setup before all tests + global-teardown.ts # Cleanup after all tests + scripts/ + run-tests.ts # Test runner script + utils/ # .gitkeep placeholder + tests/ + fixtures/ + base.fixture/ # apiHelpers, uiHelpers, llmKeysSettingsHelpers + user.fixture/ # authHelpers (email/password/OTP flows) + session.fixture/ # Browser session management + guides/ + E2E_TEST_GENERATION_GUIDE.md + E2E_TEST_ORGANIZATION_GUIDE.md + UTILITIES_AND_FIXTURES_GUIDE.md + RECORDING_GUIDE.md + oss/tests/ + manual/ # Manual tests + datalayer/ + test-apps.ts # Data layer integration tests + test-observability.ts + legacy/ # .gitkeep placeholder + playwright/ + e2e/ # E2E test suites organized by feature + settings/ + app/ + playground/ + prompt-registry/ + testsset/ + observability/ + deployment/ + smoke.spec.ts # Smoke test + unit/ # .gitkeep placeholder + utils/ # .gitkeep placeholder + ee/tests/ + manual/ # .gitkeep placeholder + legacy/ # .gitkeep placeholder + playwright/ + e2e/ # EE E2E test suites + settings/ + app/ + playground/ + prompt-registry/ + testsset/ + auto-evaluation/ + observability/ + deployment/ + human-annotation/ + unit/ # .gitkeep placeholder + utils/ # .gitkeep placeholder + oss/src/components/Playground/state/atoms/__tests__/ + core.test.ts # Component unit test (colocated with source) +``` + +### Services + +Services follows the same standardized structure as API and SDK. + +``` +services/ + oss/tests/ + manual/ # Manual tests + smoke.http # Existing smoke test + legacy/ # .gitkeep placeholder + pytest/ + e2e/ # .gitkeep placeholder (ready for E2E tests) + unit/ # .gitkeep placeholder (ready for unit tests) + utils/ # .gitkeep placeholder (ready for fixtures) + ee/tests/ + manual/ # .gitkeep placeholder + legacy/ # .gitkeep placeholder + pytest/ + e2e/ # .gitkeep placeholder + unit/ # .gitkeep placeholder + utils/ # .gitkeep placeholder +``` + +Services currently has minimal test coverage (one manual smoke test). The structure is in place and ready for expansion as services testing grows. 
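+
+Across these layouts, the shared fixture modules under `utils/` are pulled into the runner by `conftest.py` files that simply re-export them, so tests can request fixtures by name. A minimal sketch of that wiring, mirroring the SDK's `e2e/healthchecks/conftest.py` (the API and Web wiring is analogous, but module paths differ):
+
+```python
+# conftest.py -- re-export shared fixtures from utils/ so pytest discovers them.
+# (Mirrors the SDK pattern; adjust the module path for other interfaces.)
+from tests.pytest.utils.env import ag_env  # noqa: F401
+from tests.pytest.utils.sdk import ag_sdk  # noqa: F401
+from tests.pytest.utils.accounts import (
+    foo_account,  # noqa: F401
+    cls_account,  # noqa: F401
+    mod_account,  # noqa: F401
+)
+```
+
+Because the re-export lives next to the E2E tests that need it, unit tests -- which must not require environment variables or running services -- remain unaffected.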
+ +--- + +## Future expansion + +### Unit test organization + +When unit tests are added, they should be organized by [boundary layer](testing.boundaries.specs.md): + +``` +pytest/unit/ + utils/ # Layer 1: Pure functions + test_*.py + core/ # Layer 2: Business logic with mocked ports + test_*.py + adapters/ + db/ # Layer 3: DAO with mocked session + test_*.py + http/ # Layer 4: Routers with in-process client + test_*.py +``` + +### Component unit tests (Web) + +Web component unit tests remain **colocated with source code** in `__tests__/` directories: + +``` +web/oss/src/ + components/<Feature>/state/atoms/__tests__/*.test.ts + lib/helpers/__tests__/*.test.ts +``` + +This keeps unit tests close to the code they test and allows for fast feedback during development. + +--- + +## Understanding the test folder types + +### manual/ -- Freestyle, no framework + +The `manual/` folder accepts any kind of scripts or documentation. It's **freestyle** -- no required format, no required framework, no hard-coded checks. Files may include: +- `.http` files (REST client format) +- `.sh` shell scripts with curl commands +- `.py` Python scripts +- `.ts` / `.js` TypeScript/JavaScript scripts +- `.curl` curl command files +- `.md` documentation + +**Key characteristics:** +- Not run automatically +- Not in CI +- No framework required +- May be run manually by developers or agents +- Useful for ad-hoc testing, reproducing issues, or developer reference + +**Examples:** +- `api/oss/tests/manual/annotations/crud.http` -- Manual CRUD operations +- `api/ee/tests/manual/auth/*.http` -- Auth flow testing +- `web/oss/tests/manual/datalayer/*.ts` -- Data layer integration tests (run manually with tsx) + +### legacy/ -- Archived tests + +Historical tests preserved for reference during migration. **Not run.** May be deleted once migration is complete. + +### e2e/ -- Black box, system running + +End-to-end tests that treat the system as a **black box**. Expects a running system behind it (API server, web server, database, etc.). Tests only interact with public surfaces using credentials: +- API E2E: HTTP requests to API endpoints (`AGENTA_API_URL`, `AGENTA_AUTH_KEY`) +- SDK E2E: SDK client calls against live API (`AGENTA_HOST`, `AGENTA_API_KEY`) +- Web E2E: Playwright browser tests against running web app (`AGENTA_WEB_URL`) + +**No access to internals.** Tests validate behavior from the outside. + +### unit/ -- White box, system NOT running + +Unit tests that test **internal parts and layers** of the system. The system is **NOT running** -- no servers, no databases, no external dependencies. Uses: +- Dependency injection +- Mocked ports and adapters +- In-memory fakes +- Direct function/class invocation + +Tests are organized by [boundary layer](testing.boundaries.specs.md): +- `unit/utils/` -- Pure functions (parsing, formatting, validation) +- `unit/core/` -- Business logic with mocked ports +- `unit/adapters/db/` -- DAO with mocked database session +- `unit/adapters/http/` -- HTTP routers with in-process test client + +### utils/ -- Testing the tools themselves + +Tests for **libraries, tools, and helper functions** that the system uses but that aren't part of the system's core business logic. Examples: +- Testing a shared validation library +- Testing internal benchmark utilities +- Testing helper functions with boundary cases + +There's a **gray line** between `unit/utils/` (pure business utilities) and `utils/` (tooling utilities). 
When in doubt: +- If it's business domain logic → `unit/utils/` +- If it's infrastructure/tooling → `utils/` + +The `utils/` folder may also contain **shared test fixtures** (conftest helpers, account management, API clients) used by `e2e/` and `unit/` tests. diff --git a/docs/docs/self-host/guides/03-deploy-to-kubernetes.mdx b/docs/docs/self-host/guides/03-deploy-to-kubernetes.mdx index 499d1268e1..dfc411a742 100644 --- a/docs/docs/self-host/guides/03-deploy-to-kubernetes.mdx +++ b/docs/docs/self-host/guides/03-deploy-to-kubernetes.mdx @@ -2,6 +2,6 @@ title: 'Deploy on Kubernetes' --- -For the moment Kubernetes deployment is only available part of our Enterprise Edition. Agenta Enterprise is the best way to self-host Agenta. It is highly scalable and the data never leaves your environment. It provides the tools to manage multiple users and teams all in one place. +For the moment Kubernetes deployment is only available part of our Enterprise Edition (EE). Agenta Enterprise is the best way to self-host Agenta. It is highly scalable and the data never leaves your environment. It provides the tools to manage multiple users and teams all in one place. Agenta Enterprise is an early access stage for select partners. [Reach out](https://cal.com/mahmoud-mabrouk-ogzgey/demo) to inquire for more details. diff --git a/docs/drafts/security/sso-providers.mdx b/docs/drafts/security/sso-providers.mdx index f5ee70028c..34030bc086 100644 --- a/docs/drafts/security/sso-providers.mdx +++ b/docs/drafts/security/sso-providers.mdx @@ -122,7 +122,7 @@ Organizations typically progress through these phases when adopting SSO: Before configuring SSO in Agenta: -1. ✅ Agenta Enterprise Edition license +1. ✅ Agenta Enterprise Edition (EE) license 2. ✅ Organization owner or admin role 3. ✅ Access to your identity provider (Okta, Azure AD, etc.) 4. ✅ At least a Business subscription (if you use our managed offering) diff --git a/sdk/pytest.ini b/sdk/pytest.ini index 69ca41b535..4effdb546a 100644 --- a/sdk/pytest.ini +++ b/sdk/pytest.ini @@ -22,4 +22,9 @@ markers = case_typical: likely behavior case_edge: unlikely behavior speed_fast: ~ milliseconds - speed_slow: ~ seconds \ No newline at end of file + speed_slow: ~ seconds + license_oss: OSS license scope + license_ee: EE license scope + cost_free: no monetary cost (local/internal services) + cost_paid: uses paid third-party services (LLM APIs) + e2e: requires running API (AGENTA_API_URL, AGENTA_AUTH_KEY) \ No newline at end of file diff --git a/sdk/tests/integration/__init__.py b/sdk/tests/integration/__init__.py deleted file mode 100644 index 6dbbb8df96..0000000000 --- a/sdk/tests/integration/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -Integration tests for the Agenta SDK. - -These tests make REAL API calls to validate the SDK managers work correctly -with the Agenta backend API. - -Run with: pytest sdk/tests/integration/ -v -m integration - -Environment variables: -- AGENTA_HOST: API host URL (default: https://cloud.agenta.ai) -- AGENTA_API_KEY: API key for authentication (required) -""" diff --git a/sdk/tests/pytest/conftest.py b/sdk/tests/pytest/conftest.py index b4e40b7f18..004485d574 100644 --- a/sdk/tests/pytest/conftest.py +++ b/sdk/tests/pytest/conftest.py @@ -1,7 +1,3 @@ -from tests.pytest.utils.env import ag_env # noqa: F401 -from tests.pytest.utils.sdk import ag_sdk # noqa: F401 -from tests.pytest.utils.accounts import ( - foo_account, # noqa: F401 - cls_account, # noqa: F401 - mod_account, # noqa: F401 -) +# Root conftest for SDK tests. 
+# Intentionally minimal — e2e fixtures are scoped to tests/pytest/e2e/. +# Unit tests must not require environment variables or running services. diff --git a/sdk/tests/integration/vault/__init__.py b/sdk/tests/pytest/e2e/__init__.py similarity index 100% rename from sdk/tests/integration/vault/__init__.py rename to sdk/tests/pytest/e2e/__init__.py diff --git a/sdk/tests/integration/conftest.py b/sdk/tests/pytest/e2e/conftest.py similarity index 78% rename from sdk/tests/integration/conftest.py rename to sdk/tests/pytest/e2e/conftest.py index ff92c3982a..ee27518a13 100644 --- a/sdk/tests/integration/conftest.py +++ b/sdk/tests/pytest/e2e/conftest.py @@ -1,15 +1,19 @@ """ -Shared fixtures for Agenta SDK integration tests. +Shared fixtures for Agenta SDK E2E tests. These fixtures provide: -- API credentials management with environment variable support -- SDK initialization +- Account creation via the admin API (same flow as API E2E tests) +- SDK initialization with auto-provisioned credentials - Test resource creation and cleanup (apps, variants) + +Environment variables required: +- AGENTA_API_URL: e.g. http://localhost:10180/api +- AGENTA_AUTH_KEY: e.g. change-me-auth """ import os from uuid import uuid4 -from typing import Generator, Tuple, Optional, Any +from typing import Generator, Optional, Any import pytest @@ -17,55 +21,53 @@ from agenta.sdk.managers.apps import AppManager from agenta.sdk.managers.shared import SharedManager - -DEFAULT_HOST = "https://cloud.agenta.ai" - - -def get_api_credentials() -> Tuple[str, Optional[str]]: - """ - Get API credentials from environment variables. - - Returns: - Tuple of (host, api_key). api_key may be None if missing. - """ - host = os.getenv("AGENTA_HOST", DEFAULT_HOST) - api_key = os.getenv("AGENTA_API_KEY") - return host, api_key +from tests.pytest.utils.env import get_ag_env +from tests.pytest.utils.accounts import create_account -def credentials_available() -> bool: - """Check if credentials are available from environment variables.""" - host, api_key = get_api_credentials() - return bool(api_key) +def _env_available() -> bool: + """Check if the required env vars are set.""" + return bool(os.getenv("AGENTA_API_URL")) and bool(os.getenv("AGENTA_AUTH_KEY")) @pytest.fixture(autouse=True) -def _skip_integration_if_missing_credentials(request): - if request.node.get_closest_marker("integration") and not credentials_available(): - pytest.skip("API credentials not available (set AGENTA_API_KEY)") +def _skip_e2e_if_missing_env(request): + if request.node.get_closest_marker("e2e") and not _env_available(): + pytest.skip("E2E env not available (set AGENTA_API_URL and AGENTA_AUTH_KEY)") -# Skip marker for tests that require credentials -requires_credentials = pytest.mark.skipif( - not credentials_available(), - reason="API credentials not available (set AGENTA_API_KEY; AGENTA_HOST optional)", -) +@pytest.fixture(scope="session") +def ag_env(): + """Session-scoped environment (reads AGENTA_API_URL / AGENTA_AUTH_KEY).""" + return get_ag_env() @pytest.fixture(scope="session") -def api_credentials() -> Tuple[str, str]: +def e2e_account(ag_env): """ - Fixture that provides API credentials. + Create a test account via POST /admin/account (session-scoped). Returns: - Tuple of (host, api_key) + Dict with 'api_url' and 'credentials' keys. + credentials is a string like "ApiKey <key>". 
+ """ + return create_account(ag_env) + + +@pytest.fixture(scope="session") +def api_credentials(e2e_account) -> tuple: + """ + Derive (host, api_key) from the account credentials. - Skips the test if no credentials are available. + - host: api_url with the trailing '/api' stripped + - api_key: credentials with the 'ApiKey ' prefix stripped """ - host, api_key = get_api_credentials() - if not api_key or not api_key.strip(): - pytest.skip("API credentials not available (set AGENTA_API_KEY)") - assert api_key is not None + api_url = e2e_account["api_url"] + credentials = e2e_account["credentials"] + + host = api_url[:-4] # strip '/api' + api_key = credentials[7:] # strip 'ApiKey ' + return host, api_key @@ -135,7 +137,7 @@ def _force_reinit_sdk(host: str, api_key: str) -> None: @pytest.fixture(scope="function") -def agenta_init(api_credentials: Tuple[str, str]) -> Generator[None, None, None]: +def agenta_init(api_credentials: tuple) -> Generator[None, None, None]: """ Initialize the Agenta SDK with test credentials. diff --git a/sdk/tests/pytest/healthchecks/__init__.py b/sdk/tests/pytest/e2e/evaluations/__init__.py similarity index 100% rename from sdk/tests/pytest/healthchecks/__init__.py rename to sdk/tests/pytest/e2e/evaluations/__init__.py diff --git a/sdk/tests/integration/evaluations/test_evaluations_flow.py b/sdk/tests/pytest/e2e/evaluations/test_evaluations_flow.py similarity index 98% rename from sdk/tests/integration/evaluations/test_evaluations_flow.py rename to sdk/tests/pytest/e2e/evaluations/test_evaluations_flow.py index 7181a848d9..d5b33aeb58 100644 --- a/sdk/tests/integration/evaluations/test_evaluations_flow.py +++ b/sdk/tests/pytest/e2e/evaluations/test_evaluations_flow.py @@ -22,7 +22,7 @@ from agenta.sdk.evaluations import metrics, results, runs, scenarios -pytestmark = [pytest.mark.integration, pytest.mark.asyncio] +pytestmark = [pytest.mark.e2e, pytest.mark.asyncio] async def test_evaluations_run_scenario_result_close(agenta_init): diff --git a/sdk/tests/pytest/e2e/healthchecks/__init__.py b/sdk/tests/pytest/e2e/healthchecks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/tests/pytest/e2e/healthchecks/conftest.py b/sdk/tests/pytest/e2e/healthchecks/conftest.py new file mode 100644 index 0000000000..b4e40b7f18 --- /dev/null +++ b/sdk/tests/pytest/e2e/healthchecks/conftest.py @@ -0,0 +1,7 @@ +from tests.pytest.utils.env import ag_env # noqa: F401 +from tests.pytest.utils.sdk import ag_sdk # noqa: F401 +from tests.pytest.utils.accounts import ( + foo_account, # noqa: F401 + cls_account, # noqa: F401 + mod_account, # noqa: F401 +) diff --git a/sdk/tests/pytest/healthchecks/test_healthchecks.py b/sdk/tests/pytest/e2e/healthchecks/test_healthchecks.py similarity index 100% rename from sdk/tests/pytest/healthchecks/test_healthchecks.py rename to sdk/tests/pytest/e2e/healthchecks/test_healthchecks.py diff --git a/sdk/tests/pytest/e2e/integrations/__init__.py b/sdk/tests/pytest/e2e/integrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/tests/integration/evaluators/test_evaluators_manager.py b/sdk/tests/pytest/e2e/integrations/test_evaluators_manager.py similarity index 96% rename from sdk/tests/integration/evaluators/test_evaluators_manager.py rename to sdk/tests/pytest/e2e/integrations/test_evaluators_manager.py index ad2eefbe90..6da0f7e25f 100644 --- a/sdk/tests/integration/evaluators/test_evaluators_manager.py +++ b/sdk/tests/pytest/e2e/integrations/test_evaluators_manager.py @@ -19,7 +19,7 @@ from 
agenta.sdk.managers import evaluators -pytestmark = [pytest.mark.integration, pytest.mark.asyncio] +pytestmark = [pytest.mark.e2e, pytest.mark.asyncio] def _evaluator_handler(prediction: str, reference: str) -> float: diff --git a/sdk/tests/integration/prompts/test_prompt_template_storage.py b/sdk/tests/pytest/e2e/integrations/test_prompt_template_storage.py similarity index 97% rename from sdk/tests/integration/prompts/test_prompt_template_storage.py rename to sdk/tests/pytest/e2e/integrations/test_prompt_template_storage.py index 464b8b92c6..8242b101ef 100644 --- a/sdk/tests/integration/prompts/test_prompt_template_storage.py +++ b/sdk/tests/pytest/e2e/integrations/test_prompt_template_storage.py @@ -3,7 +3,7 @@ from agenta.sdk.managers.shared import SharedManager from agenta.sdk.types import Message, PromptTemplate -pytestmark = [pytest.mark.integration] +pytestmark = [pytest.mark.e2e] def test_prompt_template_messages_roundtrip_in_variant_config( diff --git a/sdk/tests/integration/testsets/test_testsets_manager.py b/sdk/tests/pytest/e2e/integrations/test_testsets_manager.py similarity index 98% rename from sdk/tests/integration/testsets/test_testsets_manager.py rename to sdk/tests/pytest/e2e/integrations/test_testsets_manager.py index c6d45110a7..9a6c534727 100644 --- a/sdk/tests/integration/testsets/test_testsets_manager.py +++ b/sdk/tests/pytest/e2e/integrations/test_testsets_manager.py @@ -20,7 +20,7 @@ from agenta.sdk.managers import testsets -pytestmark = [pytest.mark.integration, pytest.mark.asyncio] +pytestmark = [pytest.mark.e2e, pytest.mark.asyncio] async def test_testsets_upsert_fetch_edit_list_retrieve( diff --git a/sdk/tests/integration/vault/test_vault_secrets.py b/sdk/tests/pytest/e2e/integrations/test_vault_secrets.py similarity index 99% rename from sdk/tests/integration/vault/test_vault_secrets.py rename to sdk/tests/pytest/e2e/integrations/test_vault_secrets.py index ec78b9856f..d13b383d39 100644 --- a/sdk/tests/integration/vault/test_vault_secrets.py +++ b/sdk/tests/pytest/e2e/integrations/test_vault_secrets.py @@ -21,7 +21,7 @@ ) -pytestmark = [pytest.mark.integration] +pytestmark = [pytest.mark.e2e] class TestAccessControlPermissions: diff --git a/sdk/tests/pytest/e2e/observability/__init__.py b/sdk/tests/pytest/e2e/observability/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/tests/integration/tracing/test_observability_traces.py b/sdk/tests/pytest/e2e/observability/test_observability_traces.py similarity index 98% rename from sdk/tests/integration/tracing/test_observability_traces.py rename to sdk/tests/pytest/e2e/observability/test_observability_traces.py index c06f11fabc..b8c4f7fc2f 100644 --- a/sdk/tests/integration/tracing/test_observability_traces.py +++ b/sdk/tests/pytest/e2e/observability/test_observability_traces.py @@ -20,7 +20,7 @@ import agenta as ag -pytestmark = [pytest.mark.integration] +pytestmark = [pytest.mark.e2e] def test_observability_trace_lifecycle(agenta_init, otlp_flat_span_factory): @@ -108,7 +108,7 @@ def test_observability_trace_lifecycle(agenta_init, otlp_flat_span_factory): pass -@pytest.mark.integration +@pytest.mark.e2e @pytest.mark.asyncio class TestObservabilityAsync: """Test async observability API.""" diff --git a/sdk/tests/pytest/e2e/workflows/__init__.py b/sdk/tests/pytest/e2e/workflows/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/tests/integration/applications/test_apps_shared_manager.py b/sdk/tests/pytest/e2e/workflows/test_apps_shared_manager.py similarity 
index 97% rename from sdk/tests/integration/applications/test_apps_shared_manager.py rename to sdk/tests/pytest/e2e/workflows/test_apps_shared_manager.py index 24c6ba2957..9f1586747e 100644 --- a/sdk/tests/integration/applications/test_apps_shared_manager.py +++ b/sdk/tests/pytest/e2e/workflows/test_apps_shared_manager.py @@ -26,7 +26,7 @@ from agenta.sdk.types import ConfigurationResponse, DeploymentResponse # Mark all tests in this module as integration tests -pytestmark = [pytest.mark.integration] +pytestmark = [pytest.mark.e2e] def cleanup_app_safe(app_id: str) -> None: @@ -64,7 +64,7 @@ def generate_unique_slug(prefix: str = "test") -> str: # ============================================================================= -@pytest.mark.integration +@pytest.mark.e2e class TestAppManagerSync: """Test AppManager synchronous methods with real API calls.""" @@ -209,7 +209,7 @@ def test_create_list_delete_workflow(self, agenta_init): # ============================================================================= -@pytest.mark.integration +@pytest.mark.e2e @pytest.mark.asyncio class TestAppManagerAsync: """Test AppManager asynchronous methods with real API calls.""" @@ -287,7 +287,7 @@ async def test_async_create_list_workflow(self, agenta_init): # ============================================================================= -@pytest.mark.integration +@pytest.mark.e2e class TestSharedManagerSync: """Test SharedManager synchronous methods with real API calls.""" @@ -314,8 +314,8 @@ def test_add_variant(self, agenta_init, test_app): # Verify field values assert_not_none(result.variant_id, "variant_id should not be None") - assert result.variant_slug == variant_slug, ( - f"variant_slug should match: expected {variant_slug}, got {result.variant_slug}" + assert result.variant_slug.endswith(variant_slug), ( + f"variant_slug should end with {variant_slug}, got {result.variant_slug}" ) finally: @@ -338,8 +338,8 @@ def test_fetch_variant(self, agenta_init, test_variant): f"fetch() should return ConfigurationResponse, got {type(result)}" ) - # Verify we got the right variant - assert result.variant_slug == test_variant["variant_slug"] + # Verify we got the right variant (API returns fully-qualified slug) + assert result.variant_slug.endswith(test_variant["variant_slug"]) assert_has_attr(result, "params") def test_fetch_variant_by_id(self, agenta_init, test_variant): @@ -520,7 +520,7 @@ def test_complete_variant_workflow(self, agenta_init, test_app): # List configs list_result = SharedManager.list(app_id=test_app["app_id"]) assert isinstance(list_result, list) - assert any(c.variant_slug == variant_slug for c in list_result) + assert any(c.variant_slug.endswith(variant_slug) for c in list_result) # History history_result = SharedManager.history( @@ -559,7 +559,7 @@ def test_complete_variant_workflow(self, agenta_init, test_app): # ============================================================================= -@pytest.mark.integration +@pytest.mark.e2e @pytest.mark.asyncio class TestSharedManagerAsync: """Test SharedManager asynchronous methods with real API calls.""" @@ -593,7 +593,7 @@ async def test_afetch_variant(self, agenta_init, test_variant): assert_not_none(result, "afetch() should return a response") assert isinstance(result, ConfigurationResponse) - assert result.variant_slug == test_variant["variant_slug"] + assert result.variant_slug.endswith(test_variant["variant_slug"]) async def test_alist_configs(self, agenta_init, test_variant): """Test listing configs via SharedManager.alist().""" @@ 
-741,7 +741,7 @@ async def test_async_complete_workflow(self, agenta_init, test_app): # ============================================================================= -@pytest.mark.integration +@pytest.mark.e2e class TestResponseSerialization: """Test that API responses can be properly serialized/deserialized.""" @@ -810,7 +810,7 @@ def test_app_response_structure(self, agenta_init, test_app): # ============================================================================= -@pytest.mark.integration +@pytest.mark.e2e class TestErrorHandling: """Test error handling for invalid API calls.""" @@ -843,7 +843,7 @@ def test_delete_nonexistent_app(self, agenta_init): # ============================================================================= -@pytest.mark.integration +@pytest.mark.e2e class TestSharedManagerValidation: """Test parameter validation in SharedManager.""" @@ -881,7 +881,7 @@ def test_fetch_environment_version_without_slug_raises(self, agenta_init): # ============================================================================= -@pytest.mark.integration +@pytest.mark.e2e @pytest.mark.asyncio class TestConcurrentOperations: """Test concurrent async operations.""" @@ -909,4 +909,4 @@ async def test_concurrent_config_fetch(self, agenta_init, test_variant): # All results should be ConfigurationResponse for result in results: assert isinstance(result, ConfigurationResponse) - assert result.variant_slug == test_variant["variant_slug"] + assert result.variant_slug.endswith(test_variant["variant_slug"]) diff --git a/sdk/tests/integration/applications/test_legacy_applications_manager.py b/sdk/tests/pytest/e2e/workflows/test_legacy_applications_manager.py similarity index 75% rename from sdk/tests/integration/applications/test_legacy_applications_manager.py rename to sdk/tests/pytest/e2e/workflows/test_legacy_applications_manager.py index 3de0c78f5c..983eef5722 100644 --- a/sdk/tests/integration/applications/test_legacy_applications_manager.py +++ b/sdk/tests/pytest/e2e/workflows/test_legacy_applications_manager.py @@ -15,21 +15,34 @@ AGENTA_HOST: Optional, defaults to https://cloud.agenta.ai """ +import asyncio + import pytest from agenta.sdk.managers import applications -pytestmark = [pytest.mark.integration, pytest.mark.asyncio] +pytestmark = [pytest.mark.e2e, pytest.mark.asyncio] def _legacy_application_handler(prompt: str) -> str: return prompt +async def _aupsert_with_retry(*, max_retries=3, delay=2.0, **kwargs): + """Retry aupsert on 429 rate limit errors.""" + for attempt in range(max_retries): + result = await applications.aupsert(**kwargs) + if result is not None: + return result + if attempt < max_retries - 1: + await asyncio.sleep(delay * (attempt + 1)) + return None + + async def test_legacy_applications_upsert_retrieve_update( deterministic_legacy_application_slug: str, agenta_init ): - rev1_id = await applications.aupsert( + rev1_id = await _aupsert_with_retry( application_slug=deterministic_legacy_application_slug, name="SDK IT Legacy App v1", description="SDK integration test legacy application", @@ -46,7 +59,7 @@ async def test_legacy_applications_upsert_retrieve_update( assert dumped.get("id") assert dumped.get("application_id") - rev2_id = await applications.aupsert( + rev2_id = await _aupsert_with_retry( application_slug=deterministic_legacy_application_slug, name="SDK IT Legacy App v1", description="SDK integration test legacy application (updated)", diff --git a/sdk/tests/unit/README.md b/sdk/tests/pytest/unit/README.md similarity index 100% rename from 
sdk/tests/unit/README.md rename to sdk/tests/pytest/unit/README.md diff --git a/sdk/tests/unit/TESTING_PATTERNS.md b/sdk/tests/pytest/unit/TESTING_PATTERNS.md similarity index 100% rename from sdk/tests/unit/TESTING_PATTERNS.md rename to sdk/tests/pytest/unit/TESTING_PATTERNS.md diff --git a/sdk/tests/unit/__init__.py b/sdk/tests/pytest/unit/__init__.py similarity index 100% rename from sdk/tests/unit/__init__.py rename to sdk/tests/pytest/unit/__init__.py diff --git a/sdk/tests/unit/conftest.py b/sdk/tests/pytest/unit/conftest.py similarity index 100% rename from sdk/tests/unit/conftest.py rename to sdk/tests/pytest/unit/conftest.py diff --git a/sdk/tests/unit/test_tracing_decorators.py b/sdk/tests/pytest/unit/test_tracing_decorators.py similarity index 96% rename from sdk/tests/unit/test_tracing_decorators.py rename to sdk/tests/pytest/unit/test_tracing_decorators.py index b765f8ebcb..1b9bb8b343 100644 --- a/sdk/tests/unit/test_tracing_decorators.py +++ b/sdk/tests/pytest/unit/test_tracing_decorators.py @@ -71,6 +71,8 @@ def setup_method(self): # Set up mock_tracing for _post_instrument calls self.mock_tracing = Mock() self.mock_tracing.get_current_span.return_value = self.mock_span + # _redact checks `ag.tracing.redact is not None` — must be None to skip + self.mock_tracing.redact = None @patch("agenta.sdk.decorators.tracing.ag") def test_sync_function_basic(self, mock_ag): @@ -95,7 +97,7 @@ def simple_function(x, y): assert call_args[1]["name"] == "simple_function" # Verify span was set to OK status - self.mock_span.set_status.assert_called_with("OK") + self.mock_span.set_status.assert_called_with(status="OK", description=None) @pytest.mark.asyncio @patch("agenta.sdk.decorators.tracing.ag") @@ -122,7 +124,7 @@ async def simple_async_function(x, y): assert call_args[1]["name"] == "simple_async_function" # Verify span was set to OK status - self.mock_span.set_status.assert_called_with("OK") + self.mock_span.set_status.assert_called_with(status="OK", description=None) @patch("agenta.sdk.decorators.tracing.ag") def test_sync_function_with_exception(self, mock_ag): @@ -265,6 +267,8 @@ def setup_method(self): # Set up mock_tracing for _post_instrument calls self.mock_tracing = Mock() self.mock_tracing.get_current_span.return_value = self.mock_span + # _redact checks `ag.tracing.redact is not None` — must be None to skip + self.mock_tracing.redact = None @patch("agenta.sdk.decorators.tracing.ag") def test_sync_generator_basic(self, mock_ag): @@ -291,7 +295,7 @@ def simple_generator(): assert call_args[1]["name"] == "simple_generator" # Verify span was set to OK status - self.mock_span.set_status.assert_called_with("OK") + self.mock_span.set_status.assert_called_with(status="OK", description=None) @patch("agenta.sdk.decorators.tracing.ag") def test_sync_generator_with_return_value(self, mock_ag): @@ -393,7 +397,7 @@ async def simple_async_generator(): assert call_args[1]["name"] == "simple_async_generator" # Verify span was set to OK status - self.mock_span.set_status.assert_called_with("OK") + self.mock_span.set_status.assert_called_with(status="OK", description=None) @pytest.mark.asyncio @patch("agenta.sdk.decorators.tracing.ag") @@ -599,7 +603,7 @@ def large_generator(): # Verify span was created mock_ag.tracer.start_as_current_span.assert_called_once() - self.mock_span.set_status.assert_called_with("OK") + self.mock_span.set_status.assert_called_with(status="OK", description=None) @pytest.mark.asyncio @patch("agenta.sdk.decorators.tracing.ag") @@ -625,7 +629,7 @@ async def 
delayed_generator(): # Verify span was created mock_ag.tracer.start_as_current_span.assert_called_once() - self.mock_span.set_status.assert_called_with("OK") + self.mock_span.set_status.assert_called_with(status="OK", description=None) @patch("agenta.sdk.decorators.tracing.ag") def test_generator_with_mixed_types(self, mock_ag): @@ -651,7 +655,7 @@ def mixed_type_generator(): # Verify span was created mock_ag.tracer.start_as_current_span.assert_called_once() - self.mock_span.set_status.assert_called_with("OK") + self.mock_span.set_status.assert_called_with(status="OK", description=None) @patch("agenta.sdk.decorators.tracing.ag") def test_generator_with_decorator_parameters(self, mock_ag): @@ -679,4 +683,4 @@ def parameterized_generator(prompt): assert call_args[1]["name"] == "parameterized_generator" # Verify span was set to OK status - self.mock_span.set_status.assert_called_with("OK") + self.mock_span.set_status.assert_called_with(status="OK", description=None) diff --git a/web/ee/tests/2-app/create.spec.ts b/web/ee/tests/2-app/create.spec.ts deleted file mode 100644 index de0137e3cd..0000000000 --- a/web/ee/tests/2-app/create.spec.ts +++ /dev/null @@ -1,5 +0,0 @@ -import tests, {test} from "@agenta/oss/tests/2-app" - -test.describe(`EE App Creation Flow`, () => { - tests() -}) diff --git a/web/ee/tests/playwright/e2e/app/create.spec.ts b/web/ee/tests/playwright/e2e/app/create.spec.ts new file mode 100644 index 0000000000..92d6e2e451 --- /dev/null +++ b/web/ee/tests/playwright/e2e/app/create.spec.ts @@ -0,0 +1,5 @@ +import tests, {test} from "@agenta/oss/tests/playwright/2-app" + +test.describe(`EE App Creation Flow`, () => { + tests() +}) diff --git a/web/ee/tests/6-auto-evaluation/assets/README.md b/web/ee/tests/playwright/e2e/auto-evaluation/assets/README.md similarity index 100% rename from web/ee/tests/6-auto-evaluation/assets/README.md rename to web/ee/tests/playwright/e2e/auto-evaluation/assets/README.md diff --git a/web/ee/tests/6-auto-evaluation/assets/types.ts b/web/ee/tests/playwright/e2e/auto-evaluation/assets/types.ts similarity index 100% rename from web/ee/tests/6-auto-evaluation/assets/types.ts rename to web/ee/tests/playwright/e2e/auto-evaluation/assets/types.ts diff --git a/web/ee/tests/6-auto-evaluation/index.ts b/web/ee/tests/playwright/e2e/auto-evaluation/index.ts similarity index 100% rename from web/ee/tests/6-auto-evaluation/index.ts rename to web/ee/tests/playwright/e2e/auto-evaluation/index.ts diff --git a/web/ee/tests/6-auto-evaluation/run-auto-evaluation.spec.ts b/web/ee/tests/playwright/e2e/auto-evaluation/run-auto-evaluation.spec.ts similarity index 100% rename from web/ee/tests/6-auto-evaluation/run-auto-evaluation.spec.ts rename to web/ee/tests/playwright/e2e/auto-evaluation/run-auto-evaluation.spec.ts diff --git a/web/ee/tests/6-auto-evaluation/tests.ts b/web/ee/tests/playwright/e2e/auto-evaluation/tests.ts similarity index 100% rename from web/ee/tests/6-auto-evaluation/tests.ts rename to web/ee/tests/playwright/e2e/auto-evaluation/tests.ts diff --git a/web/ee/tests/8-deployment/deploy-variant.spec.ts b/web/ee/tests/playwright/e2e/deployment/deploy-variant.spec.ts similarity index 64% rename from web/ee/tests/8-deployment/deploy-variant.spec.ts rename to web/ee/tests/playwright/e2e/deployment/deploy-variant.spec.ts index 0f613a356e..6a7bf58c0c 100644 --- a/web/ee/tests/8-deployment/deploy-variant.spec.ts +++ b/web/ee/tests/playwright/e2e/deployment/deploy-variant.spec.ts @@ -1,4 +1,4 @@ import {test} from "@agenta/web-tests/tests/fixtures/base.fixture" 
-import deploymentTests from "@agenta/oss/tests/8-deployment" +import deploymentTests from "@agenta/oss/tests/playwright/8-deployment" test.describe("Deployment: test deployment", deploymentTests) diff --git a/web/ee/tests/9-human-annotation/assets/types.ts b/web/ee/tests/playwright/e2e/human-annotation/assets/types.ts similarity index 100% rename from web/ee/tests/9-human-annotation/assets/types.ts rename to web/ee/tests/playwright/e2e/human-annotation/assets/types.ts diff --git a/web/ee/tests/9-human-annotation/human-annotation.spec.ts b/web/ee/tests/playwright/e2e/human-annotation/human-annotation.spec.ts similarity index 100% rename from web/ee/tests/9-human-annotation/human-annotation.spec.ts rename to web/ee/tests/playwright/e2e/human-annotation/human-annotation.spec.ts diff --git a/web/ee/tests/9-human-annotation/index.ts b/web/ee/tests/playwright/e2e/human-annotation/index.ts similarity index 100% rename from web/ee/tests/9-human-annotation/index.ts rename to web/ee/tests/playwright/e2e/human-annotation/index.ts diff --git a/web/ee/tests/9-human-annotation/tests.ts b/web/ee/tests/playwright/e2e/human-annotation/tests.ts similarity index 97% rename from web/ee/tests/9-human-annotation/tests.ts rename to web/ee/tests/playwright/e2e/human-annotation/tests.ts index 14893b83ae..5200108d2d 100644 --- a/web/ee/tests/9-human-annotation/tests.ts +++ b/web/ee/tests/playwright/e2e/human-annotation/tests.ts @@ -11,7 +11,7 @@ const testWithHumanFixtures = baseTest.extend<HumanEvaluationFixtures>({ await use(async (appId: string) => { await page.goto(`/apps/${appId}/evaluations?selectedEvaluation=human_annotation`) await expect(page).toHaveURL( - `/apps/${appId}/evaluations?selectedEvaluation=human_annotation`, + new RegExp(`/apps/${appId}/evaluations\\?selectedEvaluation=human_annotation`), ) const evaluationRunsResponse = await waitForApiResponse<{ @@ -52,7 +52,7 @@ const testWithHumanFixtures = baseTest.extend<HumanEvaluationFixtures>({ await use(async (appId: string) => { await page.goto(`/apps/${appId}/evaluations?selectedEvaluation=human_annotation`) await expect(page).toHaveURL( - `/apps/${appId}/evaluations?selectedEvaluation=human_annotation`, + new RegExp(`/apps/${appId}/evaluations\\?selectedEvaluation=human_annotation`), ) const runs = await apiHelpers.getEvaluationRuns() diff --git a/web/ee/tests/7-observability/observability.spec.ts b/web/ee/tests/playwright/e2e/observability/observability.spec.ts similarity index 64% rename from web/ee/tests/7-observability/observability.spec.ts rename to web/ee/tests/playwright/e2e/observability/observability.spec.ts index 98908200a9..efc16d5672 100644 --- a/web/ee/tests/7-observability/observability.spec.ts +++ b/web/ee/tests/playwright/e2e/observability/observability.spec.ts @@ -1,4 +1,4 @@ import {test} from "@agenta/web-tests/tests/fixtures/base.fixture" -import observabilityTests from "@agenta/oss/tests/7-observability" +import observabilityTests from "@agenta/oss/tests/playwright/7-observability" test.describe("Observability: test observability", observabilityTests) diff --git a/web/ee/tests/3-playground/run-variant.spec.ts b/web/ee/tests/playwright/e2e/playground/run-variant.spec.ts similarity index 63% rename from web/ee/tests/3-playground/run-variant.spec.ts rename to web/ee/tests/playwright/e2e/playground/run-variant.spec.ts index 5fc8618686..cb725ad039 100644 --- a/web/ee/tests/3-playground/run-variant.spec.ts +++ b/web/ee/tests/playwright/e2e/playground/run-variant.spec.ts @@ -1,4 +1,4 @@ import {test} from 
"@agenta/web-tests/tests/fixtures/base.fixture" -import playgroundTests from "@agenta/oss/tests/3-playground" +import playgroundTests from "@agenta/oss/tests/playwright/3-playground" test.describe("Playground: Run Variant", playgroundTests) diff --git a/web/ee/tests/4-prompt-registry/prompt-registry-flow.spec.ts b/web/ee/tests/playwright/e2e/prompt-registry/prompt-registry-flow.spec.ts similarity index 61% rename from web/ee/tests/4-prompt-registry/prompt-registry-flow.spec.ts rename to web/ee/tests/playwright/e2e/prompt-registry/prompt-registry-flow.spec.ts index 511bd060ef..f0c9cdb2d3 100644 --- a/web/ee/tests/4-prompt-registry/prompt-registry-flow.spec.ts +++ b/web/ee/tests/playwright/e2e/prompt-registry/prompt-registry-flow.spec.ts @@ -1,4 +1,4 @@ import {test} from "@agenta/web-tests/tests/fixtures/base.fixture" -import promptRegistryTests from "@agenta/oss/tests/4-prompt-registry" +import promptRegistryTests from "@agenta/oss/tests/playwright/4-prompt-registry" test.describe("Prompt Registry Flow", promptRegistryTests) diff --git a/web/ee/tests/1-settings/api-keys-management.spec.ts b/web/ee/tests/playwright/e2e/settings/api-keys-management.spec.ts similarity index 62% rename from web/ee/tests/1-settings/api-keys-management.spec.ts rename to web/ee/tests/playwright/e2e/settings/api-keys-management.spec.ts index 1395cba61f..4ec1e82737 100644 --- a/web/ee/tests/1-settings/api-keys-management.spec.ts +++ b/web/ee/tests/playwright/e2e/settings/api-keys-management.spec.ts @@ -1,4 +1,4 @@ import {test} from "@agenta/web-tests/tests/fixtures/base.fixture" -import apiKeysTests from "@agenta/oss/tests/1-settings/api-keys" +import apiKeysTests from "@agenta/oss/tests/playwright/1-settings/api-keys" test.skip("Settings: API Keys Management", apiKeysTests) diff --git a/web/ee/tests/1-settings/model-hub.spec.ts b/web/ee/tests/playwright/e2e/settings/model-hub.spec.ts similarity index 60% rename from web/ee/tests/1-settings/model-hub.spec.ts rename to web/ee/tests/playwright/e2e/settings/model-hub.spec.ts index 186de6222c..da5392a202 100644 --- a/web/ee/tests/1-settings/model-hub.spec.ts +++ b/web/ee/tests/playwright/e2e/settings/model-hub.spec.ts @@ -1,4 +1,4 @@ import {test} from "@agenta/web-tests/tests/fixtures/base.fixture" -import modelHubTests from "@agenta/oss/tests/1-settings/model-hub" +import modelHubTests from "@agenta/oss/tests/playwright/1-settings/model-hub" test.describe("Settings: Model Hub", modelHubTests) diff --git a/web/ee/tests/5-testsset/testset.spec.ts b/web/ee/tests/playwright/e2e/testsset/testset.spec.ts similarity index 66% rename from web/ee/tests/5-testsset/testset.spec.ts rename to web/ee/tests/playwright/e2e/testsset/testset.spec.ts index 5f5ed87486..2e3c8f2d9b 100644 --- a/web/ee/tests/5-testsset/testset.spec.ts +++ b/web/ee/tests/playwright/e2e/testsset/testset.spec.ts @@ -1,4 +1,4 @@ import {test} from "@agenta/web-tests/tests/fixtures/base.fixture" -import testsetTests from "@agenta/oss/tests/5-testsset" +import testsetTests from "@agenta/oss/tests/playwright/5-testsset" test.describe("Testsets: Interact with testsets", testsetTests) diff --git a/web/oss/src/components/pages/overview/deployments/DeploymentDrawer/index.tsx b/web/oss/src/components/pages/overview/deployments/DeploymentDrawer/index.tsx index 40dec368dd..f2fdb7b2ae 100644 --- a/web/oss/src/components/pages/overview/deployments/DeploymentDrawer/index.tsx +++ b/web/oss/src/components/pages/overview/deployments/DeploymentDrawer/index.tsx @@ -120,11 +120,7 @@ const DeploymentDrawer = ({ 
{selectedEnvironment.deployed_variant_name && ( <Space orientation="horizontal"> <Tooltip - title={ - isDemo() - ? "" - : "History available in Cloud/Enterprise editions only" - } + title={isDemo() ? "" : "History available in Cloud/EE only"} > <Button size="small" diff --git a/web/oss/src/components/pages/settings/WorkspaceManage/Modals/InviteUsersModal.tsx b/web/oss/src/components/pages/settings/WorkspaceManage/Modals/InviteUsersModal.tsx index 4e1e2ce379..bc38100f14 100644 --- a/web/oss/src/components/pages/settings/WorkspaceManage/Modals/InviteUsersModal.tsx +++ b/web/oss/src/components/pages/settings/WorkspaceManage/Modals/InviteUsersModal.tsx @@ -219,7 +219,7 @@ const InviteUsersModal: FC<InviteUsersModalProps> = ({ <Typography.Paragraph type="secondary"> Invite members to your team by entering their emails.{" "} {!isEE() || !hasRBAC - ? "Role base access control is available in the cloud and enterprise editions of Agenta" + ? "Role-based access control is available in Cloud/EE." : "You can specify the roles to control the access level of the invited members on Agenta."} </Typography.Paragraph> <InviteForm diff --git a/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/endpoints/index.tsx b/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/endpoints/index.tsx index 1f047983ff..2dc47af534 100644 --- a/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/endpoints/index.tsx +++ b/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/endpoints/index.tsx @@ -247,7 +247,7 @@ export default function VariantEndpoint() { ) : ( <Tooltip placement="right" - title="Deployment History available in Cloud/Enterprise editions only" + title="Deployment History available in Cloud/EE only" > History </Tooltip> diff --git a/web/oss/tests/datalayer/test-apps.ts b/web/oss/tests/manual/datalayer/test-apps.ts similarity index 100% rename from web/oss/tests/datalayer/test-apps.ts rename to web/oss/tests/manual/datalayer/test-apps.ts diff --git a/web/oss/tests/datalayer/test-observability.ts b/web/oss/tests/manual/datalayer/test-observability.ts similarity index 100% rename from web/oss/tests/datalayer/test-observability.ts rename to web/oss/tests/manual/datalayer/test-observability.ts diff --git a/web/oss/tests/datalayer/utils/shared-test-setup.ts b/web/oss/tests/manual/datalayer/utils/shared-test-setup.ts similarity index 100% rename from web/oss/tests/datalayer/utils/shared-test-setup.ts rename to web/oss/tests/manual/datalayer/utils/shared-test-setup.ts diff --git a/web/oss/tests/datalayer/utils/test-analysis.ts b/web/oss/tests/manual/datalayer/utils/test-analysis.ts similarity index 100% rename from web/oss/tests/datalayer/utils/test-analysis.ts rename to web/oss/tests/manual/datalayer/utils/test-analysis.ts diff --git a/web/oss/tests/datalayer/utils/test-types.ts b/web/oss/tests/manual/datalayer/utils/test-types.ts similarity index 100% rename from web/oss/tests/datalayer/utils/test-types.ts rename to web/oss/tests/manual/datalayer/utils/test-types.ts diff --git a/web/oss/tests/2-app/assets/README.md b/web/oss/tests/playwright/e2e/app/assets/README.md similarity index 100% rename from web/oss/tests/2-app/assets/README.md rename to web/oss/tests/playwright/e2e/app/assets/README.md diff --git a/web/oss/tests/2-app/assets/types.ts b/web/oss/tests/playwright/e2e/app/assets/types.ts similarity index 100% rename from web/oss/tests/2-app/assets/types.ts rename to web/oss/tests/playwright/e2e/app/assets/types.ts diff --git a/web/oss/tests/2-app/create.spec.ts 
b/web/oss/tests/playwright/e2e/app/create.spec.ts similarity index 100% rename from web/oss/tests/2-app/create.spec.ts rename to web/oss/tests/playwright/e2e/app/create.spec.ts diff --git a/web/oss/tests/2-app/index.ts b/web/oss/tests/playwright/e2e/app/index.ts similarity index 100% rename from web/oss/tests/2-app/index.ts rename to web/oss/tests/playwright/e2e/app/index.ts diff --git a/web/oss/tests/2-app/test.ts b/web/oss/tests/playwright/e2e/app/test.ts similarity index 97% rename from web/oss/tests/2-app/test.ts rename to web/oss/tests/playwright/e2e/app/test.ts index 405aafc676..56f5a8e3bc 100644 --- a/web/oss/tests/2-app/test.ts +++ b/web/oss/tests/playwright/e2e/app/test.ts @@ -15,7 +15,7 @@ const testWithAppFixtures = baseTest.extend<AppFixtures>({ navigateToApps: async ({page, uiHelpers}, use) => { await use(async () => { await page.goto("/apps") - await page.waitForURL("/apps", {waitUntil: "domcontentloaded"}) + await page.waitForURL("**/apps", {waitUntil: "domcontentloaded"}) await uiHelpers.expectText("App Management", { role: "heading", }) diff --git a/web/oss/tests/8-deployment/deploy-variant.spec.ts b/web/oss/tests/playwright/e2e/deployment/deploy-variant.spec.ts similarity index 100% rename from web/oss/tests/8-deployment/deploy-variant.spec.ts rename to web/oss/tests/playwright/e2e/deployment/deploy-variant.spec.ts diff --git a/web/oss/tests/8-deployment/index.ts b/web/oss/tests/playwright/e2e/deployment/index.ts similarity index 100% rename from web/oss/tests/8-deployment/index.ts rename to web/oss/tests/playwright/e2e/deployment/index.ts diff --git a/web/oss/tests/7-observability/index.ts b/web/oss/tests/playwright/e2e/observability/index.ts similarity index 100% rename from web/oss/tests/7-observability/index.ts rename to web/oss/tests/playwright/e2e/observability/index.ts diff --git a/web/oss/tests/7-observability/observability.spec.ts b/web/oss/tests/playwright/e2e/observability/observability.spec.ts similarity index 100% rename from web/oss/tests/7-observability/observability.spec.ts rename to web/oss/tests/playwright/e2e/observability/observability.spec.ts diff --git a/web/oss/tests/3-playground/assets/README.md b/web/oss/tests/playwright/e2e/playground/assets/README.md similarity index 100% rename from web/oss/tests/3-playground/assets/README.md rename to web/oss/tests/playwright/e2e/playground/assets/README.md diff --git a/web/oss/tests/3-playground/assets/constants.ts b/web/oss/tests/playwright/e2e/playground/assets/constants.ts similarity index 100% rename from web/oss/tests/3-playground/assets/constants.ts rename to web/oss/tests/playwright/e2e/playground/assets/constants.ts diff --git a/web/oss/tests/3-playground/assets/types.ts b/web/oss/tests/playwright/e2e/playground/assets/types.ts similarity index 100% rename from web/oss/tests/3-playground/assets/types.ts rename to web/oss/tests/playwright/e2e/playground/assets/types.ts diff --git a/web/oss/tests/3-playground/index.ts b/web/oss/tests/playwright/e2e/playground/index.ts similarity index 98% rename from web/oss/tests/3-playground/index.ts rename to web/oss/tests/playwright/e2e/playground/index.ts index 9af773d8a8..4ea10291be 100644 --- a/web/oss/tests/3-playground/index.ts +++ b/web/oss/tests/playwright/e2e/playground/index.ts @@ -1,5 +1,5 @@ import {COMPLETION_MESSAGES, NEW_VARIABLES, PROMPT_MESSAGES} from "./assets/constants" -import {test as basePlaygroundTest} from "./tests.spec" +import {test as basePlaygroundTest} from "./tests" import { createTagString, diff --git 
a/web/oss/tests/3-playground/run-variant.spec.ts b/web/oss/tests/playwright/e2e/playground/run-variant.spec.ts similarity index 100% rename from web/oss/tests/3-playground/run-variant.spec.ts rename to web/oss/tests/playwright/e2e/playground/run-variant.spec.ts diff --git a/web/oss/tests/3-playground/tests.spec.ts b/web/oss/tests/playwright/e2e/playground/tests.ts similarity index 100% rename from web/oss/tests/3-playground/tests.spec.ts rename to web/oss/tests/playwright/e2e/playground/tests.ts diff --git a/web/oss/tests/4-prompt-registry/index.ts b/web/oss/tests/playwright/e2e/prompt-registry/index.ts similarity index 100% rename from web/oss/tests/4-prompt-registry/index.ts rename to web/oss/tests/playwright/e2e/prompt-registry/index.ts diff --git a/web/oss/tests/4-prompt-registry/prompt-registry-flow.spec.ts b/web/oss/tests/playwright/e2e/prompt-registry/prompt-registry-flow.spec.ts similarity index 100% rename from web/oss/tests/4-prompt-registry/prompt-registry-flow.spec.ts rename to web/oss/tests/playwright/e2e/prompt-registry/prompt-registry-flow.spec.ts diff --git a/web/oss/tests/1-settings/api-keys-management.spec.ts b/web/oss/tests/playwright/e2e/settings/api-keys-management.spec.ts similarity index 100% rename from web/oss/tests/1-settings/api-keys-management.spec.ts rename to web/oss/tests/playwright/e2e/settings/api-keys-management.spec.ts diff --git a/web/oss/tests/1-settings/api-keys.ts b/web/oss/tests/playwright/e2e/settings/api-keys.ts similarity index 100% rename from web/oss/tests/1-settings/api-keys.ts rename to web/oss/tests/playwright/e2e/settings/api-keys.ts diff --git a/web/oss/tests/1-settings/model-hub.spec.ts b/web/oss/tests/playwright/e2e/settings/model-hub.spec.ts similarity index 100% rename from web/oss/tests/1-settings/model-hub.spec.ts rename to web/oss/tests/playwright/e2e/settings/model-hub.spec.ts diff --git a/web/oss/tests/1-settings/model-hub.ts b/web/oss/tests/playwright/e2e/settings/model-hub.ts similarity index 100% rename from web/oss/tests/1-settings/model-hub.ts rename to web/oss/tests/playwright/e2e/settings/model-hub.ts diff --git a/web/oss/tests/playwright/e2e/smoke.spec.ts b/web/oss/tests/playwright/e2e/smoke.spec.ts new file mode 100644 index 0000000000..e67117c74f --- /dev/null +++ b/web/oss/tests/playwright/e2e/smoke.spec.ts @@ -0,0 +1,9 @@ +import {test, expect} from "@playwright/test" + +test("smoke: auth works and can navigate to apps", async ({page}) => { + test.setTimeout(10000) + await page.goto("/apps") + await page.waitForURL("**/apps", {timeout: 5000}) + await expect(page).toHaveURL(/apps/) + console.log("[smoke] Current URL:", page.url()) +}) diff --git a/web/oss/tests/5-testsset/index.ts b/web/oss/tests/playwright/e2e/testsset/index.ts similarity index 100% rename from web/oss/tests/5-testsset/index.ts rename to web/oss/tests/playwright/e2e/testsset/index.ts diff --git a/web/oss/tests/5-testsset/testset.spec.ts b/web/oss/tests/playwright/e2e/testsset/testset.spec.ts similarity index 100% rename from web/oss/tests/5-testsset/testset.spec.ts rename to web/oss/tests/playwright/e2e/testsset/testset.spec.ts diff --git a/web/package.json b/web/package.json index d405f000f7..8da25458a2 100644 --- a/web/package.json +++ b/web/package.json @@ -44,7 +44,7 @@ "turbo": "2.8.2", "typescript-eslint": "^8.50.0" }, - "packageManager": "pnpm@10.4.1", + "packageManager": "pnpm@10.29.2", "scripts": { "build-oss": "turbo run build --filter=@agenta/oss", "build-ee": "turbo run build --filter=@agenta/ee", diff --git a/web/tests/playwright.config.ts 
b/web/tests/playwright.config.ts index 9e09ba264e..d944ebd377 100644 --- a/web/tests/playwright.config.ts +++ b/web/tests/playwright.config.ts @@ -5,7 +5,6 @@ import {fileURLToPath} from "url" import {defineConfig} from "@playwright/test" import dotenv from "dotenv" -import {allProjects} from "./playwright/config/projects" // Get current directory in ESM const __filename = fileURLToPath(import.meta.url) @@ -28,7 +27,7 @@ if (missingEnvVars.length > 0) { */ const require = createRequire(import.meta.url) export default defineConfig({ - testDir: `../${process.env.PROJECT_DIRECTORY}/tests`, + testDir: `../${process.env.AGENTA_LICENSE || "oss"}/tests/playwright/e2e`, fullyParallel: false, // Temporarily disabled parallel worker forbidOnly: !!process.env.CI, retries: process.env.CI ? 2 : process.env.RETRIES ? parseInt(process.env.RETRIES) : 0, @@ -47,11 +46,10 @@ export default defineConfig({ }, use: { + baseURL: process.env.AGENTA_WEB_URL || "http://localhost", trace: "on-first-retry", screenshot: "only-on-failure", video: "retain-on-failure", storageState: "state.json", }, - - projects: allProjects, }) diff --git a/web/tests/playwright/config/deployments.ts b/web/tests/playwright/config/deployments.ts deleted file mode 100644 index f98fc02ca6..0000000000 --- a/web/tests/playwright/config/deployments.ts +++ /dev/null @@ -1,15 +0,0 @@ -import {TestEnvironment} from "./testTags" -import type PlaywrightConfig from "./types" - -/** - * Base URLs for different deployment environments - * Maps environment types to their respective API endpoints - */ -export const deployments: Record<PlaywrightConfig.DeploymentType, string> = { - [TestEnvironment.local]: process.env.AGENTA_WEB_URL || "http://localhost", - [TestEnvironment.staging]: "https://cloud.staging.agenta.ai", - [TestEnvironment.beta]: "https://cloud.beta.agenta.ai", - [TestEnvironment.oss]: "https://oss.agenta.ai", - [TestEnvironment.demo]: "https://cloud.demo.agenta.ai", - [TestEnvironment.prod]: "https://cloud.agenta.ai", -} as const diff --git a/web/tests/playwright/config/projects.ts b/web/tests/playwright/config/projects.ts index 98bd59666a..e9a6164d47 100644 --- a/web/tests/playwright/config/projects.ts +++ b/web/tests/playwright/config/projects.ts @@ -1,36 +1,13 @@ import {devices, type Project} from "@playwright/test" -import {deployments} from "./deployments" -import {TestEnvironment} from "./testTags" -import type PlaywrightConfig from "./types" - /** - * Base configuration for all test projects - * Uses Chrome Desktop as the default browser + * Single project configuration. + * Base URL comes from AGENTA_WEB_URL, license from AGENTA_LICENSE. 
*/ -const baseConfig = { +export const project: Project = { + name: process.env.AGENTA_LICENSE || "oss", use: { ...devices["Desktop Chrome"], + baseURL: process.env.AGENTA_WEB_URL || "http://localhost", }, } - -/** - * Creates a project configuration for a specific environment - * @param env - Target environment type - * @returns Playwright project configuration - */ -const createProjectConfig = (env: PlaywrightConfig.TestEnvironmentType): Project => ({ - ...baseConfig, - name: env, - use: {...baseConfig.use, baseURL: deployments[env]}, -}) - -// Generate project configurations for all environments -const baseProjects = Object.keys(TestEnvironment).map((env) => - createProjectConfig(env as PlaywrightConfig.TestEnvironmentType), -) - -/** - * Combined project configurations for all environments - */ -export const allProjects = [...baseProjects] diff --git a/web/tests/playwright/config/testTags.ts b/web/tests/playwright/config/testTags.ts index bdf255a8f4..ae296186d9 100644 --- a/web/tests/playwright/config/testTags.ts +++ b/web/tests/playwright/config/testTags.ts @@ -33,39 +33,33 @@ export const TestPath = { } as const /** - * Deployment environments where tests can be executed + * Role types for different test scenarios */ -export const TestEnvironment = { - local: "local", // Local deployment - staging: "staging", // Staging environment - beta: "beta", // Beta environment - oss: "oss", // OSS environment - demo: "demo", // Demo environment - prod: "prod", // Production environment +export const TestRoleType = { + Owner: "owner", + Editor: "editor", + Viewer: "viewer", } as const /** - * Feature availability scope for different deployment types + * Plan types for different test scenarios */ -export const TestFeatureLicenseScopeType = { - EE: "ee", // Features only available in ee +export const TestPlanType = { + Hobby: "hobby", + Pro: "pro", } as const /** - * Permission types for different test scenarios + * Cost types for test execution */ -export const TestPermissionType = { - Owner: "owner", - Editor: "editor", - Viewer: "viewer", +export const TestCostType = { + Free: "free", // No monetary cost + Paid: "paid", // Uses paid third-party services } as const -/** - * Entitlement types for different test scenarios - */ -export const TestEntitlementType = { - Hobby: "hobby", - Pro: "pro", +export const TestLicenseType = { + OSS: "oss", + EE: "ee", } as const export const TestLensType = { @@ -84,19 +78,6 @@ export const TestSpeedType = { SLOW: "slow", } as const -/** - * Environment-specific feature configuration - * Defines which features are available in each environment - */ -export const environmentFeatures: PlaywrightConfig.EnvironmentProjectConfig = { - local: {}, - staging: {}, - beta: {}, - oss: {}, - demo: {}, - prod: {}, -} as const - /** * Tag argument definitions for CLI and test decoration * Maps tag types to their CLI flags and test decoration prefixes @@ -105,13 +86,13 @@ export const TAG_ARGUMENTS: Record<PlaywrightConfig.TestTagType, PlaywrightConfi scope: {flag: "-scope", prefix: "@scope:"}, coverage: {flag: "-coverage", prefix: "@coverage:"}, path: {flag: "-path", prefix: "@path:"}, - env: {flag: "-env", prefix: "@env:"}, - feature: {flag: "-feature", prefix: "@feature:"}, - entitlement: {flag: "-entitlement", prefix: "@entitlement:"}, - permission: {flag: "-permission", prefix: "@permission:"}, + plan: {flag: "-plan", prefix: "@plan:"}, + role: {flag: "-role", prefix: "@role:"}, lens: {flag: "-lens", prefix: "@lens:"}, case: {flag: "-case", prefix: "@case:"}, speed: 
{flag: "-speed", prefix: "@speed:"}, + license: {flag: "-license", prefix: "@license:"}, + cost: {flag: "-cost", prefix: "@cost:"}, } as const /** @@ -128,7 +109,5 @@ export type { TestTagType, TestTag, TagArgument, - TestEnvironmentType, ProjectFeatureConfig, - EnvironmentProjectConfig, } from "./types" diff --git a/web/tests/playwright/config/types.d.ts b/web/tests/playwright/config/types.d.ts index fd0b698a92..9685a79992 100644 --- a/web/tests/playwright/config/types.d.ts +++ b/web/tests/playwright/config/types.d.ts @@ -9,31 +9,34 @@ declare namespace PlaywrightConfig { (typeof import("./testTags").TestCoverage)[keyof typeof import("./testTags").TestCoverage] type TestPathType = (typeof import("./testTags").TestPath)[keyof typeof import("./testTags").TestPath] - type TestEnvironmentType = keyof typeof import("./testTags").TestEnvironment - type TestFeatureLicenseScopeType = - (typeof import("./testTags").TestFeatureScope)[keyof typeof import("./testTags").TestFeatureScope] - type TestEntitlementType = - (typeof import("./testTags").TestEntitlementType)[keyof typeof import("./testTags").TestEntitlementType] - type TestPermissionType = - (typeof import("./testTags").TestPermissionType)[keyof typeof import("./testTags").TestPermissionType] + type TestPlanType = + (typeof import("./testTags").TestPlanType)[keyof typeof import("./testTags").TestPlanType] + type TestRoleType = + (typeof import("./testTags").TestRoleType)[keyof typeof import("./testTags").TestRoleType] type TestLensType = (typeof import("./testTags").TestLensType)[keyof typeof import("./testTags").TestLensType] type TestcaseType = (typeof import("./testTags").TestcaseType)[keyof typeof import("./testTags").TestcaseType] + type TestSpeedType = + (typeof import("./testTags").TestSpeedType)[keyof typeof import("./testTags").TestSpeedType] + type TestCostType = + (typeof import("./testTags").TestCostType)[keyof typeof import("./testTags").TestCostType] + type TestLicenseType = + (typeof import("./testTags").TestLicenseType)[keyof typeof import("./testTags").TestLicenseType] /** Test tag system configuration */ type TestTagType = | "scope" | "coverage" | "path" - | "env" - | "feature" - | "entitlement" - | "permission" + | "plan" + | "role" | "lens" | "case" | "speed" - type TestTag = TestScopeType | TestCoverageType | TestPathType | TestEnvironmentType + | "license" + | "cost" + type TestTag = TestScopeType | TestCoverageType | TestPathType /** Tag argument structure for CLI and test decoration */ interface TagArgument { @@ -43,13 +46,8 @@ declare namespace PlaywrightConfig { /** Project feature configuration for different environments */ interface ProjectFeatureConfig { - // readonly features: TestFeatureScopeType[] // Available features in environment + // Configuration for project-specific features } - - /** Environment-specific project configurations */ - type EnvironmentProjectConfig = Record<TestEnvironmentType, ProjectFeatureConfig> - /** Deployment environment type alias */ - type DeploymentType = TestEnvironmentType } export = PlaywrightConfig diff --git a/web/tests/playwright/global-setup.ts b/web/tests/playwright/global-setup.ts index 336438de91..50b7deab6e 100644 --- a/web/tests/playwright/global-setup.ts +++ b/web/tests/playwright/global-setup.ts @@ -2,7 +2,7 @@ * Automates Playwright authentication and storage setup. 
*/ -import {chromium, FullConfig} from "@playwright/test" +import {chromium} from "@playwright/test" import {waitForApiResponse} from "../tests/fixtures/base.fixture/apiHelpers" import { @@ -20,21 +20,19 @@ import {getTestmailClient} from "../utils/testmail" * Handles both login and signup flows. * Stores authenticated state in a file to be reused by tests. */ -async function globalSetup(config: FullConfig) { +async function globalSetup() { // Automate authentication before Playwright tests console.log("[global-setup] Starting global setup for authentication") - const project = config.projects.find((project) => project.name === process.env.PROJECT) - console.log(`[global-setup] Resolved project: ${process.env.PROJECT}`) - if (!project) { - throw new Error(`Project ${process.env.PROJECT} not found`) - } - const {baseURL, storageState} = project.use + const baseURL = process.env.AGENTA_WEB_URL || "http://localhost" + const license = process.env.AGENTA_LICENSE || "oss" + const storageState = "state.json" + console.log(`[global-setup] Base URL: ${baseURL}, License: ${license}`) const timeout = 60000 const inputDelay = 100 - const {email, password} = createInitialUserState({ - name: project.name, + const {email} = createInitialUserState({ + name: license, }) console.log("[global-setup] Launching browser") @@ -42,7 +40,7 @@ async function globalSetup(config: FullConfig) { const page = await browser.newPage() console.log(`[global-setup] Navigating to auth page: ${baseURL}/auth`) - await page.goto(`${baseURL}/auth`) + await page.goto(`${baseURL}/auth`, {timeout}) console.log("[global-setup] Clearing local storage") @@ -63,97 +61,161 @@ async function globalSetup(config: FullConfig) { } } + /** + * Handles the post-signup onboarding flow if it appears. + * The post-signup form requires POSTHOG_API_KEY to load the survey. + * Without it, the page auto-redirects to /get-started or /apps. 
+ */ + async function handlePostSignup(): Promise<void> { + try { + await page.waitForURL("**/post-signup", {waitUntil: "load", timeout: 10000}) + } catch { + // No post-signup flow — already redirected to app + console.log("[global-setup] No post-signup redirect detected, continuing") + return + } + + console.log("[global-setup] New user detected, on post-signup page") + + // Race: the survey form loads ("Tell us about yourself") OR + // the page redirects away (no PostHog API key → redirects to /get-started or /apps) + const tellUsAboutYourselfLocator = page.getByText("Tell us about yourself") + const redirected = page.waitForURL( + (url) => !url.pathname.endsWith("/post-signup"), + {timeout: 15000}, + ) + const surveyLoaded = tellUsAboutYourselfLocator + .waitFor({state: "visible", timeout: 15000}) + .then(() => "survey" as const) + + const result = await Promise.race([ + surveyLoaded, + redirected.then(() => "redirected" as const), + ]) + + if (result === "redirected") { + console.log("[global-setup] Post-signup redirected (no PostHog survey), continuing") + return + } + + console.log("[global-setup] PostHog survey loaded, completing post-signup flow") + const isOptionVisible = await page.getByRole("option", {name: "Hobbyist"}).isVisible() + + if (isOptionVisible) { + await selectOption(page, {text: "2-10"}) + await selectOption(page, {text: "Hobbyist"}) + await selectOption(page, {text: "Just exploring"}) + await clickButton(page, "Continue") + + const whatBringsYouHereLocator = page.getByText("What brings you here?") + await whatBringsYouHereLocator.waitFor({state: "visible"}) + + await selectOption(page, {text: "Evaluating LLM Applications"}) + await selectOption(page, {text: "Github"}) + await clickButton(page, "Continue") + console.log("[global-setup] Post-signup flow completed") + await waitForPath(page, `${baseURL}/apps`) + } else { + console.log("[global-setup] Post-signup flow not completed due to missing options") + } + } + const timestamp = Date.now() - console.log(`[global-setup] Typing email: ${email}`) - await typeWithDelay(page, 'input[type="email"]', email) - const signinButton = await page.getByRole("button", {name: "Sign in"}) + // For OSS, use admin credentials from env vars + const loginEmail = + license === "oss" ? 
process.env.AGENTA_ADMIN_EMAIL || email : email + const adminPassword = process.env.AGENTA_ADMIN_PASSWORD + + console.log(`[global-setup] Typing email: ${loginEmail}`) + await typeWithDelay(page, 'input[type="email"]', loginEmail) + + // Detect which auth flow the page shows + const signinButton = page.getByRole("button", {name: "Sign in"}) const hasSigninButton = await signinButton.isVisible() - if (hasSigninButton) { - // Password sign-in flow - if (!password) { - throw new Error("Password is required for password sign-in flow") - } + try { + if (hasSigninButton) { + // Password sign-in flow (OSS with pre-created admin account) + const password = adminPassword + if (!password) { + throw new Error( + "AGENTA_ADMIN_PASSWORD is required for the password sign-in flow", + ) + } - try { - console.log("[global-setup] Typing password") + console.log("[global-setup] Password sign-in flow detected") await typeWithDelay(page, "input[type='password']", password) - console.log("[global-setup] Clicking Sign in button") await signinButton.click() console.log(`[global-setup] Waiting for navigation to: ${baseURL}/apps`) await waitForPath(page, `${baseURL}/apps`) - } catch (error) { - console.error("[global-setup] Error in login flow:", error) - throw error - } finally { - console.log("[global-setup] Saving storage state and closing browser") - await page.context().storageState({path: storageState as string}) - await browser.close() - } - } else { - // Email verification and OTP flow - await clickButton(page, "Continue with email") - const verifyEmailLocator = page.getByText("Verify your email") - await verifyEmailLocator.waitFor({state: "visible"}) - try { - console.log("[global-setup] Waiting for OTP email") - const otp = await testmail.waitForOTP(email, { - timeout, - timestamp_from: timestamp, - }) - console.log("[global-setup] OTP received, preparing to input") - const responsePromise = waitForApiResponse<AuthResponse>(page, { - route: "/api/auth/signinup/code/consume", - validateStatus: true, - }) - - await fillOTPDigits(otp, inputDelay) - console.log("[global-setup] Clicking Next button after OTP input") - await clickButton(page, "Next") - const responseData = await responsePromise - - if (responseData.createdNewRecipeUser) { - console.log("[global-setup] New user detected, completing post-signup flow") - await page.waitForURL(`${baseURL}/post-signup`, {waitUntil: "load"}) - - const tellUsAboutYourselfLocator = page.getByText("Tell us about yourself") - await tellUsAboutYourselfLocator.waitFor({state: "visible"}) - const isOptionVisible = await page - .getByRole("option", {name: "Hobbyist"}) - .isVisible() - - if (isOptionVisible) { - await selectOption(page, {text: "2-10"}) - await selectOption(page, {text: "Hobbyist"}) - await selectOption(page, {text: "Just exploring"}) - await clickButton(page, "Continue") - - const whatBringsYouHereLocator = page.getByText("What brings you here?") - await whatBringsYouHereLocator.waitFor({state: "visible"}) - - await selectOption(page, {text: "Evaluating LLM Applications"}) - await selectOption(page, { - text: "Github", - }) - await clickButton(page, "Continue") - console.log("[global-setup] Post-signup flow completed") - console.log(`[global-setup] Waiting for navigation to: ${baseURL}/apps`) - await waitForPath(page, `${baseURL}/apps`) - } else { - console.log( - "[global-setup] Post-signup flow not completed due to missing options", - ) + } else { + // Click the email continue button (text varies by deployment) + const continueWithEmail = 
page.getByRole("button", {name: "Continue with email"}) + const continueButton = page.getByRole("button", {name: "Continue", exact: true}) + if (await continueWithEmail.isVisible()) { + await continueWithEmail.click() + } else { + await continueButton.click() + } + + // Wait to see which flow appears: OTP or password signup + const verifyEmailLocator = page.getByText("Verify your email") + const passwordInput = page.locator("input[type='password']") + + // Race: whichever appears first determines the flow + await Promise.race([ + verifyEmailLocator.waitFor({state: "visible", timeout}), + passwordInput.waitFor({state: "visible", timeout}), + ]) + + if (await passwordInput.isVisible()) { + // Email + password signup/signin flow (local EE with SuperTokens) + console.log("[global-setup] Email + password flow detected") + const testPassword = "TestPass123!" + await typeWithDelay(page, "input[type='password']", testPassword) + await clickButton(page, "Continue with password") + + await handlePostSignup() + + // Wait for the page to settle on an authenticated URL + console.log("[global-setup] Waiting for authenticated page") + await page.waitForURL( + (url) => !url.pathname.includes("/auth") && !url.pathname.endsWith("/post-signup"), + {timeout}, + ) + console.log(`[global-setup] Settled on: ${page.url()}`) + } else { + // OTP flow (cloud EE with SuperTokens passwordless) + console.log("[global-setup] OTP flow detected") + console.log("[global-setup] Waiting for OTP email") + const otp = await testmail.waitForOTP(email, { + timeout, + timestamp_from: timestamp, + }) + console.log("[global-setup] OTP received, preparing to input") + const responsePromise = waitForApiResponse<AuthResponse>(page, { + route: "/api/auth/signinup/code/consume", + validateStatus: true, + }) + + await fillOTPDigits(otp, inputDelay) + console.log("[global-setup] Clicking Next button after OTP input") + await clickButton(page, "Next") + const responseData = await responsePromise + + if (responseData.createdNewRecipeUser) { + await handlePostSignup() } } - } catch (error) { - console.error("[global-setup] Error in login flow:", error) - throw error - } finally { - console.log("[global-setup] Saving storage state and closing browser") - await page.context().storageState({path: storageState as string}) - await browser.close() } + } catch (error) { + console.error("[global-setup] Error in login flow:", error) + throw error + } finally { + console.log("[global-setup] Saving storage state and closing browser") + await page.context().storageState({path: storageState as string}) + await browser.close() } } diff --git a/web/tests/playwright/global-teardown.ts b/web/tests/playwright/global-teardown.ts index 6144596f71..bc7633111f 100644 --- a/web/tests/playwright/global-teardown.ts +++ b/web/tests/playwright/global-teardown.ts @@ -13,24 +13,20 @@ import {fileURLToPath} from "url" * Attempts to delete all accounts in local OSS testing environments. * Uses environment variables to determine eligibility and endpoint configuration. 
*/ -async function globalTeardown(config: any) { +async function globalTeardown() { console.log("[global-teardown] Starting global teardown...") - const project = config.projects.find((project: any) => project.name === process.env.PROJECT) - - if (!project) { - throw new Error(`Project ${process.env.PROJECT} not found`) - } - const {baseURL} = project.use + const baseURL = process.env.AGENTA_WEB_URL || "http://localhost" console.log(`[global-teardown] Using web-url: ${baseURL}`) const token = process.env.AGENTA_AUTH_KEY const apiURL = process.env.AGENTA_API_URL || `${baseURL}/api` console.log(`[global-teardown] Using api-url: ${apiURL}`) + const license = process.env.AGENTA_LICENSE || "oss" console.log( - `[global-teardown] Environment variables - token: ${token ? "present" : "absent"}, LICENSE: ${process.env.LICENSE}, PROJECT: ${process.env.PROJECT}`, + `[global-teardown] Environment variables - token: ${token ? "present" : "absent"}, AGENTA_LICENSE: ${license}`, ) - if (token && process.env.LICENSE === "oss" && process.env.PROJECT === "local") { + if (token && license === "oss") { console.log( "[global-teardown] Conditions met for deleting all accounts, sending request...", ) diff --git a/web/tests/playwright/scripts/run-tests.ts b/web/tests/playwright/scripts/run-tests.ts index d7cb395633..03ef7b5394 100644 --- a/web/tests/playwright/scripts/run-tests.ts +++ b/web/tests/playwright/scripts/run-tests.ts @@ -1,9 +1,85 @@ /** * Playwright Test Runner Script - * Executes test suites based on provided command line arguments. + * Executes test suites with support for test dimension filtering. + * + * Converts test dimension flags (--coverage, --lens, etc.) into Playwright --grep patterns. + * Example: --coverage smoke --path happy -> --grep "@coverage:smoke.*@path:happy" */ import {execSync} from "child_process" -const command = `playwright test ${process.argv.slice(2).join(" ")}` -execSync(command, {stdio: "inherit"}) +// Test dimension types and their tag prefixes +const DIMENSION_PREFIXES: Record<string, string> = { + coverage: "@coverage:", + lens: "@lens:", + path: "@path:", + case: "@case:", + speed: "@speed:", + scope: "@scope:", + license: "@license:", + cost: "@cost:", + plan: "@plan:", + role: "@role:", +} + +interface ParsedArgs { + grepPatterns: string[] + playwrightArgs: string[] +} + +function parseArgs(args: string[]): ParsedArgs { + const grepPatterns: string[] = [] + const playwrightArgs: string[] = [] + + let i = 0 + while (i < args.length) { + const arg = args[i] + + // Check if this is a dimension flag + const dimensionMatch = arg.match(/^--?(coverage|lens|path|case|speed|scope|license|cost|plan|role)$/) + + if (dimensionMatch && i + 1 < args.length) { + const dimension = dimensionMatch[1] + const value = args[i + 1] + const prefix = DIMENSION_PREFIXES[dimension] + grepPatterns.push(`${prefix}${value}`) + i += 2 // Skip both the flag and its value + } else { + // Pass through to playwright + playwrightArgs.push(arg) + i++ + } + } + + return {grepPatterns, playwrightArgs} +} + +function buildCommand(grepPatterns: string[], playwrightArgs: string[]): string { + const parts = ["playwright", "test"] + + // Add grep pattern if we have dimension filters + if (grepPatterns.length > 0) { + // Combine patterns with .* to match all dimensions + const grepExpression = grepPatterns.join(".*") + parts.push("--grep", `"${grepExpression}"`) + } + + // Add remaining playwright arguments + parts.push(...playwrightArgs) + + return parts.join(" ") +} + +// Parse command line arguments (skip 
node and script paths) +const args = process.argv.slice(2) +const {grepPatterns, playwrightArgs} = parseArgs(args) + +// Build and execute the command +const command = buildCommand(grepPatterns, playwrightArgs) +console.log(`Executing: ${command}`) + +try { + execSync(command, {stdio: "inherit"}) +} catch (error) { + process.exit(1) +} diff --git a/web/tests/tests/fixtures/base.fixture/apiHelpers/index.ts b/web/tests/tests/fixtures/base.fixture/apiHelpers/index.ts index 99fec27e7f..513801b693 100644 --- a/web/tests/tests/fixtures/base.fixture/apiHelpers/index.ts +++ b/web/tests/tests/fixtures/base.fixture/apiHelpers/index.ts @@ -48,7 +48,7 @@ export const waitForApiResponse = async <T>(page: Page, options: ApiHandlerOptio export const getApp = async (page: Page, type: APP_TYPE = "completion") => { await page.goto("/apps") - await page.waitForURL("/apps") + await page.waitForURL("**/apps") const appsResponse = await waitForApiResponse<ListAppsItem[]>(page, { route: "/api/apps", diff --git a/web/tests/tests/fixtures/base.fixture/uiHelpers/helpers.ts b/web/tests/tests/fixtures/base.fixture/uiHelpers/helpers.ts index dbbf7a9e20..10d9cc50ad 100644 --- a/web/tests/tests/fixtures/base.fixture/uiHelpers/helpers.ts +++ b/web/tests/tests/fixtures/base.fixture/uiHelpers/helpers.ts @@ -7,7 +7,12 @@ export const typeWithDelay = async (page: Page, selector: string, text: string, } export const waitForPath = async (page: Page, path: string) => { - await page.waitForURL(path, {waitUntil: "domcontentloaded"}) + // Strip protocol+host if full URL is passed, then match by pathname suffix + // to support workspace-scoped URLs (/w/{id}/p/{id}/path) + const pathname = path.replace(/^https?:\/\/[^/]+/, "") + await page.waitForURL((url) => url.pathname.endsWith(pathname), { + waitUntil: "domcontentloaded", + }) } export const clickButton = async (page: Page, name: string, locator?: Locator) => { diff --git a/web/tests/tests/fixtures/user.fixture/authHelpers/utilities.ts b/web/tests/tests/fixtures/user.fixture/authHelpers/utilities.ts index 72695b0cd1..ff93991930 100644 --- a/web/tests/tests/fixtures/user.fixture/authHelpers/utilities.ts +++ b/web/tests/tests/fixtures/user.fixture/authHelpers/utilities.ts @@ -1,76 +1,39 @@ import {WorkerInfo} from "@playwright/test" -import {TestEnvironment, type TestEnvironmentType} from "../../../../playwright/config/testTags" import {getTestmailClient} from "../../../../utils/testmail" import {UserState} from "../types" -/** - * Determines the test environment based on the Playwright worker's project name - * - * @param workerInfo - Playwright worker information containing project details - * @returns The determined environment type (local, staging, beta, oss) - * @throws Error if project name doesn't match a known environment - */ -export function determineEnvironment(project: Partial<WorkerInfo["project"]>): TestEnvironmentType { - const projectName = project.name as TestEnvironmentType - - if (!Object.keys(TestEnvironment).includes(projectName)) { - throw new Error( - `Invalid project name "${projectName}". 
Must be one of: ${Object.keys( - TestEnvironment, - ).join(", ")}`, - ) - } - - return projectName -} - -/** - * @deprecated will be removed in a future release since both ee and oss now require authentication - * Determines if authentication is required based on environment and test tags - */ -export function requiresAuthentication(environment: TestEnvironmentType, tags?: string[]): boolean { - return true -} - /** * Creates initial user state for a worker * - * Generates a unique email address and sets up initial state based on: - * - Environment determined from worker info - * - Default authentication requirement based on environment + * Generates a unique email address and sets up initial state. + * All tests now require authentication. * - * @param workerInfo - Playwright worker information + * @param project - Playwright project information * @returns Initial UserState object * * @example - * const userState = createInitialUserState(workerInfo); + * const userState = createInitialUserState(project); * // Returns { * // email: "abc123@namespace.testmail.app", * // isAuthenticated: false, - * // environment: "staging", * // requiresAuth: true * // } */ export function createInitialUserState(project: Partial<WorkerInfo["project"]>): UserState { - const environment = determineEnvironment(project) const testmail = getTestmailClient() // Create email with structured tag - const email = - process.env.LICENSE === "oss" && process.env.AGENTA_OSS_OWNER_EMAIL - ? process.env.AGENTA_OSS_OWNER_EMAIL - : testmail.generateTestEmail({ - scope: project.name, - branch: process.env.BRANCH_NAME, - }) + const email = testmail.generateTestEmail({ + scope: project.name, + branch: process.env.BRANCH_NAME, + }) return { email, isAuthenticated: false, - environment, requiresAuth: true, - password: process.env.LICENSE === "oss" ? process.env.AGENTA_OSS_OWNER_PASSWORD : "", + password: "", } } diff --git a/web/tests/tests/fixtures/user.fixture/types.ts b/web/tests/tests/fixtures/user.fixture/types.ts index f815bfc0c7..ef76df6354 100644 --- a/web/tests/tests/fixtures/user.fixture/types.ts +++ b/web/tests/tests/fixtures/user.fixture/types.ts @@ -1,4 +1,3 @@ -import type {TestEnvironmentType} from "../../../playwright/config/testTags" import type {BaseFixture} from "../base.fixture/types" import type {AuthHelpers} from "./authHelpers/types" @@ -7,7 +6,6 @@ export interface UserState { email: string password?: string isAuthenticated: boolean - environment: TestEnvironmentType requiresAuth: boolean }
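// Illustrative sketch (an assumption, not part of the change set above): how a spec could be
// tagged so that the dimension filtering added in web/tests/playwright/scripts/run-tests.ts
// selects it. Running `pnpm tsx playwright/scripts/run-tests.ts --coverage smoke --path happy`
// builds `playwright test --grep "@coverage:smoke.*@path:happy"`; because the patterns are
// joined with `.*` in flag order, the tags need to appear in the test title in that same order
// for the regex to match. The @license:oss tag below uses the TestLicenseType value from
// web/tests/playwright/config/testTags.ts; the overall tag combination is a hypothetical example.
import {test, expect} from "@playwright/test"

test("apps page loads @coverage:smoke @path:happy @license:oss", async ({page}) => {
    await page.goto("/apps")
    await page.waitForURL("**/apps")
    await expect(page).toHaveURL(/apps/)
})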