diff --git a/dotnet/test/AskUserTests.cs b/dotnet/test/AskUserTests.cs
index c21df302..55a56367 100644
--- a/dotnet/test/AskUserTests.cs
+++ b/dotnet/test/AskUserTests.cs
@@ -8,7 +8,7 @@
 
 namespace GitHub.Copilot.SDK.Test;
 
-public class AskUserTests(E2ETestFixture fixture, ITestOutputHelper output) : E2ETestBase(fixture, "ask-user", output)
+public class AskUserTests(E2ETestFixture fixture, ITestOutputHelper output) : E2ETestBase(fixture, "ask_user", output)
 {
     [Fact]
     public async Task Should_Invoke_User_Input_Handler_When_Model_Uses_Ask_User_Tool()
diff --git a/dotnet/test/McpAndAgentsTests.cs b/dotnet/test/McpAndAgentsTests.cs
index d216032a..f24b7c8b 100644
--- a/dotnet/test/McpAndAgentsTests.cs
+++ b/dotnet/test/McpAndAgentsTests.cs
@@ -8,7 +8,7 @@
 
 namespace GitHub.Copilot.SDK.Test;
 
-public class McpAndAgentsTests(E2ETestFixture fixture, ITestOutputHelper output) : E2ETestBase(fixture, "mcp-and-agents", output)
+public class McpAndAgentsTests(E2ETestFixture fixture, ITestOutputHelper output) : E2ETestBase(fixture, "mcp_and_agents", output)
 {
     [Fact]
     public async Task Should_Accept_MCP_Server_Configuration_On_Session_Create()
diff --git a/go/e2e/testharness/context.go b/go/e2e/testharness/context.go
index 718b08db..5206ee06 100644
--- a/go/e2e/testharness/context.go
+++ b/go/e2e/testharness/context.go
@@ -4,6 +4,7 @@ import (
 	"os"
 	"path/filepath"
 	"regexp"
+	"runtime"
 	"strings"
 	"sync"
 	"testing"
@@ -95,10 +96,23 @@ func (c *TestContext) ConfigureForTest(t *testing.T) {
 
 	// Format: test/snapshots/<testFile>/<testName>.yaml
 	// e.g., test/snapshots/session/should_have_stateful_conversation.yaml
+
+	// Get the test file name from the caller's file path
+	_, callerFile, _, ok := runtime.Caller(1)
+	if !ok {
+		t.Fatal("Failed to get caller information")
+	}
+
+	// Extract test file name: ask_user_test.go -> ask_user
+	testFile := strings.TrimSuffix(filepath.Base(callerFile), "_test.go")
+
+	// Extract and sanitize the subtest name from t.Name()
+	// e.g., t.Name() returns "TestAskUser/should_handle_freeform_user_input_response"
 	testName := t.Name()
 	parts := strings.SplitN(testName, "/", 2)
-
-	testFile := strings.ToLower(strings.TrimPrefix(parts[0], "Test"))
+	if len(parts) < 2 {
+		t.Fatalf("Expected test name with subtest, got: %s", testName)
+	}
 	sanitizedName := strings.ToLower(regexp.MustCompile(`[^a-zA-Z0-9]`).ReplaceAllString(parts[1], "_"))
 	snapshotPath := filepath.Join("..", "..", "test", "snapshots", testFile, sanitizedName+".yaml")
 
diff --git a/nodejs/test/e2e/ask-user.test.ts b/nodejs/test/e2e/ask_user.test.ts
similarity index 100%
rename from nodejs/test/e2e/ask-user.test.ts
rename to nodejs/test/e2e/ask_user.test.ts
diff --git a/nodejs/test/e2e/harness/sdkTestContext.ts b/nodejs/test/e2e/harness/sdkTestContext.ts
index ba68bb24..094eaff9 100644
--- a/nodejs/test/e2e/harness/sdkTestContext.ts
+++ b/nodejs/test/e2e/harness/sdkTestContext.ts
@@ -45,6 +45,8 @@ export async function createSdkTestContext({
         cwd: workDir,
         env,
         logLevel: logLevel || "error",
+        // Use fake token in CI to allow cached responses without real auth
+        githubToken: process.env.CI === "true" ? "fake-token-for-e2e-tests" : undefined,
     });
 
     const harness = { homeDir, workDir, openAiEndpoint, copilotClient, env };
@@ -93,7 +95,8 @@ function getTrafficCapturePath(testContext: TestContext): string {
         );
     }
 
-    const testFileName = basename(testFilePath, suffix);
+    // Convert to snake_case for cross-SDK snapshot compatibility
+    const testFileName = basename(testFilePath, suffix).replace(/-/g, "_");
     const taskNameAsFilename = testContext.task.name.replace(/[^a-z0-9]/gi, "_").toLowerCase();
     return join(SNAPSHOTS_DIR, testFileName, `${taskNameAsFilename}.yaml`);
 }
diff --git a/nodejs/test/e2e/mcp-and-agents.test.ts b/nodejs/test/e2e/mcp_and_agents.test.ts
similarity index 100%
rename from nodejs/test/e2e/mcp-and-agents.test.ts
rename to nodejs/test/e2e/mcp_and_agents.test.ts
diff --git a/nodejs/test/e2e/session.test.ts b/nodejs/test/e2e/session.test.ts
index 9d5c0ef1..8e086023 100644
--- a/nodejs/test/e2e/session.test.ts
+++ b/nodejs/test/e2e/session.test.ts
@@ -5,7 +5,7 @@ import { CLI_PATH, createSdkTestContext } from "./harness/sdkTestContext.js";
 import { getFinalAssistantMessage, getNextEventOfType } from "./harness/sdkTestHelper.js";
 
 describe("Sessions", async () => {
-    const { copilotClient: client, openAiEndpoint, homeDir } = await createSdkTestContext();
+    const { copilotClient: client, openAiEndpoint, homeDir, env } = await createSdkTestContext();
 
     it("should create and destroy sessions", async () => {
         const session = await client.createSession({ model: "fake-test-model" });
@@ -158,11 +158,8 @@ describe("Sessions", async () => {
         // Resume using a new client
         const newClient = new CopilotClient({
             cliPath: CLI_PATH,
-            env: {
-                ...process.env,
-                XDG_CONFIG_HOME: homeDir,
-                XDG_STATE_HOME: homeDir,
-            },
+            env,
+            githubToken: process.env.CI === "true" ? "fake-token-for-e2e-tests" : undefined,
         });
 
         onTestFinished(() => newClient.forceStop());
@@ -387,7 +384,9 @@ describe("Send Blocking Behavior", async () => {
         expect(events).toContain("assistant.message");
     });
 
-    it("sendAndWait throws on timeout", async () => {
+    // Skip in CI - this test validates client-side timeout behavior, not LLM responses.
+    // The test intentionally times out before receiving a response, so its snapshot has no response to replay.
+    it.skipIf(process.env.CI === "true")("sendAndWait throws on timeout", async () => {
         const session = await client.createSession();
 
         // Use a slow command to ensure timeout triggers before completion
diff --git a/python/e2e/test_ask_user.py b/python/e2e/test_ask_user.py
index 282b529b..93036ea4 100644
--- a/python/e2e/test_ask_user.py
+++ b/python/e2e/test_ask_user.py
@@ -10,7 +10,7 @@
 
 
 class TestAskUser:
-    async def test_invoke_user_input_handler_when_model_uses_ask_user_tool(
+    async def test_should_invoke_user_input_handler_when_model_uses_ask_user_tool(
         self, ctx: E2ETestContext
     ):
         """Test that user input handler is invoked when model uses ask_user tool"""
@@ -48,7 +48,7 @@ async def on_user_input_request(request, invocation):
 
         await session.destroy()
 
-    async def test_receive_choices_in_user_input_request(self, ctx: E2ETestContext):
+    async def test_should_receive_choices_in_user_input_request(self, ctx: E2ETestContext):
         """Test that choices are received in user input request"""
         user_input_requests = []
 
@@ -84,7 +84,7 @@ async def on_user_input_request(request, invocation):
 
         await session.destroy()
 
-    async def test_handle_freeform_user_input_response(self, ctx: E2ETestContext):
+    async def test_should_handle_freeform_user_input_response(self, ctx: E2ETestContext):
         """Test that freeform user input responses work"""
         user_input_requests = []
         freeform_answer = "This is my custom freeform answer that was not in the choices"
diff --git a/python/e2e/test_hooks.py b/python/e2e/test_hooks.py
index 57aaee18..b64628e0 100644
--- a/python/e2e/test_hooks.py
+++ b/python/e2e/test_hooks.py
@@ -11,7 +11,7 @@
 
 
 class TestHooks:
-    async def test_invoke_pre_tool_use_hook_when_model_runs_a_tool(self, ctx: E2ETestContext):
+    async def test_should_invoke_pretooluse_hook_when_model_runs_a_tool(self, ctx: E2ETestContext):
         """Test that preToolUse hook is invoked when model runs a tool"""
         pre_tool_use_inputs = []
 
@@ -38,7 +38,9 @@ async def on_pre_tool_use(input_data, invocation):
 
         await session.destroy()
 
-    async def test_invoke_post_tool_use_hook_after_model_runs_a_tool(self, ctx: E2ETestContext):
+    async def test_should_invoke_posttooluse_hook_after_model_runs_a_tool(
+        self, ctx: E2ETestContext
+    ):
         """Test that postToolUse hook is invoked after model runs a tool"""
         post_tool_use_inputs = []
 
@@ -65,7 +67,9 @@ async def on_post_tool_use(input_data, invocation):
 
         await session.destroy()
 
-    async def test_invoke_both_hooks_for_single_tool_call(self, ctx: E2ETestContext):
+    async def test_should_invoke_both_pretooluse_and_posttooluse_hooks_for_a_single_tool_call(
+        self, ctx: E2ETestContext
+    ):
         """Test that both preToolUse and postToolUse hooks fire for the same tool call"""
         pre_tool_use_inputs = []
         post_tool_use_inputs = []
@@ -103,7 +107,9 @@ async def on_post_tool_use(input_data, invocation):
 
         await session.destroy()
 
-    async def test_deny_tool_execution_when_pre_tool_use_returns_deny(self, ctx: E2ETestContext):
+    async def test_should_deny_tool_execution_when_pretooluse_returns_deny(
+        self, ctx: E2ETestContext
+    ):
         """Test that returning deny in preToolUse prevents tool execution"""
         pre_tool_use_inputs = []
 
diff --git a/python/e2e/test_mcp_and_agents.py b/python/e2e/test_mcp_and_agents.py
index 95738d5e..bfff6c09 100644
--- a/python/e2e/test_mcp_and_agents.py
+++ b/python/e2e/test_mcp_and_agents.py
@@ -12,7 +12,9 @@
 
 
 class TestMCPServers:
-    async def test_accept_mcp_server_config_on_create(self, ctx: E2ETestContext):
+    async def test_should_accept_mcp_server_configuration_on_session_create(
+        self, ctx: E2ETestContext
+    ):
         """Test that MCP server configuration is accepted on session create"""
         mcp_servers: dict[str, MCPServerConfig] = {
             "test-server": {
@@ -34,7 +36,9 @@ async def test_accept_mcp_server_config_on_create(self, ctx: E2ETestContext):
 
         await session.destroy()
 
-    async def test_accept_mcp_server_config_on_resume(self, ctx: E2ETestContext):
+    async def test_should_accept_mcp_server_configuration_on_session_resume(
+        self, ctx: E2ETestContext
+    ):
         """Test that MCP server configuration is accepted on session resume"""
         # Create a session first
         session1 = await ctx.client.create_session()
@@ -61,31 +65,11 @@ async def test_accept_mcp_server_config_on_resume(self, ctx: E2ETestContext):
 
         await session2.destroy()
 
-    async def test_handle_multiple_mcp_servers(self, ctx: E2ETestContext):
-        """Test that multiple MCP servers can be configured"""
-        mcp_servers: dict[str, MCPServerConfig] = {
-            "server1": {
-                "type": "local",
-                "command": "echo",
-                "args": ["server1"],
-                "tools": ["*"],
-            },
-            "server2": {
-                "type": "local",
-                "command": "echo",
-                "args": ["server2"],
-                "tools": ["*"],
-            },
-        }
-
-        session = await ctx.client.create_session({"mcp_servers": mcp_servers})
-
-        assert session.session_id is not None
-        await session.destroy()
-
 
 class TestCustomAgents:
-    async def test_accept_custom_agent_config_on_create(self, ctx: E2ETestContext):
+    async def test_should_accept_custom_agent_configuration_on_session_create(
+        self, ctx: E2ETestContext
+    ):
         """Test that custom agent configuration is accepted on session create"""
         custom_agents: list[CustomAgentConfig] = [
             {
@@ -108,7 +92,9 @@ async def test_accept_custom_agent_config_on_create(self, ctx: E2ETestContext):
 
         await session.destroy()
 
-    async def test_accept_custom_agent_config_on_resume(self, ctx: E2ETestContext):
+    async def test_should_accept_custom_agent_configuration_on_session_resume(
+        self, ctx: E2ETestContext
+    ):
         """Test that custom agent configuration is accepted on session resume"""
         # Create a session first
         session1 = await ctx.client.create_session()
@@ -135,74 +121,9 @@ async def test_accept_custom_agent_config_on_resume(self, ctx: E2ETestContext):
 
         await session2.destroy()
 
-    async def test_handle_custom_agent_with_tools(self, ctx: E2ETestContext):
-        """Test that custom agent with tools configuration is accepted"""
-        custom_agents: list[CustomAgentConfig] = [
-            {
-                "name": "tool-agent",
-                "display_name": "Tool Agent",
-                "description": "An agent with specific tools",
-                "prompt": "You are an agent with specific tools.",
-                "tools": ["bash", "edit"],
-                "infer": True,
-            }
-        ]
-
-        session = await ctx.client.create_session({"custom_agents": custom_agents})
-
-        assert session.session_id is not None
-        await session.destroy()
-
-    async def test_handle_custom_agent_with_mcp_servers(self, ctx: E2ETestContext):
-        """Test that custom agent with its own MCP servers is accepted"""
-        custom_agents: list[CustomAgentConfig] = [
-            {
-                "name": "mcp-agent",
-                "display_name": "MCP Agent",
-                "description": "An agent with its own MCP servers",
-                "prompt": "You are an agent with MCP servers.",
-                "mcp_servers": {
-                    "agent-server": {
-                        "type": "local",
-                        "command": "echo",
-                        "args": ["agent-mcp"],
-                        "tools": ["*"],
-                    }
-                },
-            }
-        ]
-
-        session = await ctx.client.create_session({"custom_agents": custom_agents})
-
-        assert session.session_id is not None
-        await session.destroy()
-
-    async def test_handle_multiple_custom_agents(self, ctx: E2ETestContext):
-        """Test that multiple custom agents can be configured"""
-        custom_agents: list[CustomAgentConfig] = [
-            {
-                "name": "agent1",
-                "display_name": "Agent One",
-                "description": "First agent",
-                "prompt": "You are agent one.",
-            },
-            {
-                "name": "agent2",
-                "display_name": "Agent Two",
-                "description": "Second agent",
-                "prompt": "You are agent two.",
-                "infer": False,
-            },
-        ]
-
-        session = await ctx.client.create_session({"custom_agents": custom_agents})
-
-        assert session.session_id is not None
-        await session.destroy()
-
 
 class TestCombinedConfiguration:
-    async def test_accept_mcp_servers_and_custom_agents(self, ctx: E2ETestContext):
+    async def test_should_accept_both_mcp_servers_and_custom_agents(self, ctx: E2ETestContext):
         """Test that both MCP servers and custom agents can be configured together"""
         mcp_servers: dict[str, MCPServerConfig] = {
             "shared-server": {
diff --git a/python/e2e/test_permissions.py b/python/e2e/test_permissions.py
index eedfbe9a..7635219d 100644
--- a/python/e2e/test_permissions.py
+++ b/python/e2e/test_permissions.py
@@ -15,7 +15,7 @@
 
 
 class TestPermissions:
-    async def test_permission_handler_for_write_operations(self, ctx: E2ETestContext):
+    async def test_should_invoke_permission_handler_for_write_operations(self, ctx: E2ETestContext):
         """Test that permission handler is invoked for write operations"""
         permission_requests = []
 
@@ -44,28 +44,7 @@ def on_permission_request(
 
         await session.destroy()
 
-    async def test_permission_handler_for_shell_commands(self, ctx: E2ETestContext):
-        """Test that permission handler is invoked for shell commands"""
-        permission_requests = []
-
-        def on_permission_request(
-            request: PermissionRequest, invocation: dict
-        ) -> PermissionRequestResult:
-            permission_requests.append(request)
-            # Approve the permission
-            return {"kind": "approved"}
-
-        session = await ctx.client.create_session({"on_permission_request": on_permission_request})
-
-        await session.send_and_wait({"prompt": "Run 'echo hello' and tell me the output"})
-
-        # Should have received at least one shell permission request
-        shell_requests = [req for req in permission_requests if req.get("kind") == "shell"]
-        assert len(shell_requests) > 0
-
-        await session.destroy()
-
-    async def test_deny_permission(self, ctx: E2ETestContext):
+    async def test_should_deny_permission_when_handler_returns_denied(self, ctx: E2ETestContext):
         """Test denying permissions"""
 
         def on_permission_request(
@@ -89,7 +68,9 @@ def on_permission_request(
 
         await session.destroy()
 
-    async def test_without_permission_handler(self, ctx: E2ETestContext):
+    async def test_should_work_without_permission_handler__default_behavior_(
+        self, ctx: E2ETestContext
+    ):
         """Test that sessions work without permission handler (default behavior)"""
         # Create session without on_permission_request handler
         session = await ctx.client.create_session()
@@ -101,7 +82,7 @@ async def test_without_permission_handler(self, ctx: E2ETestContext):
 
         await session.destroy()
 
-    async def test_async_permission_handler(self, ctx: E2ETestContext):
+    async def test_should_handle_async_permission_handler(self, ctx: E2ETestContext):
         """Test async permission handler"""
         permission_requests = []
 
@@ -121,7 +102,7 @@ async def on_permission_request(
 
         await session.destroy()
 
-    async def test_resume_session_with_permission_handler(self, ctx: E2ETestContext):
+    async def test_should_resume_session_with_permission_handler(self, ctx: E2ETestContext):
         """Test resuming session with permission handler"""
         permission_requests = []
 
@@ -148,7 +129,7 @@ def on_permission_request(
 
         await session2.destroy()
 
-    async def test_permission_handler_errors(self, ctx: E2ETestContext):
+    async def test_should_handle_permission_handler_errors_gracefully(self, ctx: E2ETestContext):
         """Test that permission handler errors are handled gracefully"""
 
         def on_permission_request(
@@ -169,7 +150,7 @@ def on_permission_request(
 
         await session.destroy()
 
-    async def test_tool_call_id_in_permission_requests(self, ctx: E2ETestContext):
+    async def test_should_receive_toolcallid_in_permission_requests(self, ctx: E2ETestContext):
         """Test that toolCallId is included in permission requests"""
         received_tool_call_id = False
 
diff --git a/python/e2e/test_session.py b/python/e2e/test_session.py
index 3cd18852..f2e545ed 100644
--- a/python/e2e/test_session.py
+++ b/python/e2e/test_session.py
@@ -1,5 +1,7 @@
 """E2E Session Tests"""
 
+import os
+
 import pytest
 
 from copilot import CopilotClient
@@ -158,8 +160,14 @@ async def test_should_resume_a_session_using_a_new_client(self, ctx: E2ETestCont
         assert "2" in answer.data.content
 
         # Resume using a new client
+        github_token = "fake-token-for-e2e-tests" if os.environ.get("CI") == "true" else None
         new_client = CopilotClient(
-            {"cli_path": ctx.cli_path, "cwd": ctx.work_dir, "env": ctx.get_env()}
+            {
+                "cli_path": ctx.cli_path,
+                "cwd": ctx.work_dir,
+                "env": ctx.get_env(),
+                "github_token": github_token,
+            }
         )
 
         try:
diff --git a/python/e2e/testharness/context.py b/python/e2e/testharness/context.py
index 359fc844..e0b8ea4e 100644
--- a/python/e2e/testharness/context.py
+++ b/python/e2e/testharness/context.py
@@ -65,11 +65,14 @@ async def setup(self):
         self.proxy_url = await self._proxy.start()
 
         # Create the shared client (like Node.js/Go do)
+        # Use fake token in CI to allow cached responses without real auth
+        github_token = "fake-token-for-e2e-tests" if os.environ.get("CI") == "true" else None
         self._client = CopilotClient(
             {
                 "cli_path": self.cli_path,
                 "cwd": self.work_dir,
                 "env": self.get_env(),
+                "github_token": github_token,
             }
         )
 
diff --git a/test/harness/replayingCapiProxy.ts b/test/harness/replayingCapiProxy.ts
index b48a5b50..a700596b 100644
--- a/test/harness/replayingCapiProxy.ts
+++ b/test/harness/replayingCapiProxy.ts
@@ -32,6 +32,12 @@ const normalizedToolNames = {
   [shellConfig.writeShellToolName]: "${write_shell}",
 };
 
+/**
+ * Default model to use when no stored data is available for a given test.
+ * This enables responding to /models without needing to have a capture file.
+ */
+const defaultModel = "claude-sonnet-4.5";
+
 /**
  * An HTTP proxy that not only captures HTTP exchanges, but also stores them in a file on disk and
  * replays the stored responses on subsequent runs.
@@ -149,7 +155,9 @@ export class ReplayingCapiProxy extends CapturingHttpProxy {
           options.requestOptions.path?.startsWith("/stop") &&
           options.requestOptions.method === "POST"
         ) {
-          const skipWritingCache = options.requestOptions.path.includes("skipWritingCache=true");
+          const skipWritingCache = options.requestOptions.path.includes(
+            "skipWritingCache=true",
+          );
           options.onResponseStart(200, {});
           options.onResponseEnd();
           await this.stop(skipWritingCache);
@@ -184,13 +192,13 @@ export class ReplayingCapiProxy extends CapturingHttpProxy {
         }
 
         // Handle /models endpoint
-        if (
-          options.requestOptions.path === "/models" &&
-          state.storedData?.models.length
-        ) {
-          const modelsResponse = createGetModelsResponse(
-            state.storedData.models,
-          );
+        // Use stored models if available, otherwise use default model
+        if (options.requestOptions.path === "/models") {
+          const models =
+            state.storedData?.models && state.storedData.models.length > 0
+              ? state.storedData.models
+              : [defaultModel];
+          const modelsResponse = createGetModelsResponse(models);
           const body = JSON.stringify(modelsResponse);
           const headers = {
             "content-type": "application/json",
@@ -202,6 +210,27 @@ export class ReplayingCapiProxy extends CapturingHttpProxy {
           return;
         }
 
+        // Handle memory endpoints - return stub responses in tests
+        // Matches: /agents/*/memory/*/enabled, /agents/*/memory/*/recent, etc.
+        if (options.requestOptions.path?.match(/\/agents\/.*\/memory\//)) {
+          let body: string;
+          if (options.requestOptions.path.includes("/enabled")) {
+            body = JSON.stringify({ enabled: false });
+          } else if (options.requestOptions.path.includes("/recent")) {
+            body = JSON.stringify({ memories: [] });
+          } else {
+            body = JSON.stringify({});
+          }
+          const headers = {
+            "content-type": "application/json",
+            ...commonResponseHeaders,
+          };
+          options.onResponseStart(200, headers);
+          options.onData(Buffer.from(body));
+          options.onResponseEnd();
+          return;
+        }
+
         // Handle /chat/completions endpoint
         if (
           state.storedData &&
@@ -257,7 +286,7 @@ export class ReplayingCapiProxy extends CapturingHttpProxy {
         // Fallback to normal proxying if no cached response found
         // This implicitly captures the new exchange too
         if (process.env.CI === "true") {
-          await emitNoMatchingRequestWarning(
+          await exitWithNoMatchingRequestError(
             options,
             state.testInfo,
             state.workDir,
@@ -295,7 +324,7 @@ async function writeCapturesToDisk(
   }
 }
 
-async function emitNoMatchingRequestWarning(
+async function exitWithNoMatchingRequestError(
   options: PerformRequestOptions,
   testInfo: { file: string; line?: number } | undefined,
   workDir: string,
@@ -305,18 +334,27 @@ async function emitNoMatchingRequestWarning(
   if (testInfo?.file) parts.push(`file=${testInfo.file}`);
   if (typeof testInfo?.line === "number") parts.push(`line=${testInfo.line}`);
   const header = parts.length ? ` ${parts.join(",")}` : "";
-  const normalized = await parseAndNormalizeRequest(
-    options.body,
-    workDir,
-    toolResultNormalizers,
-  );
-  const normalizedMessages = normalized.conversations[0]?.messages ?? [];
-  const warningMessage =
-    `No cached response found for ${options.requestOptions.method} ${options.requestOptions.path}. ` +
-    `Final message: ${JSON.stringify(
+
+  let finalMessageInfo: string;
+  try {
+    const normalized = await parseAndNormalizeRequest(
+      options.body,
+      workDir,
+      toolResultNormalizers,
+    );
+    const normalizedMessages = normalized.conversations[0]?.messages ?? [];
+    finalMessageInfo = JSON.stringify(
       normalizedMessages[normalizedMessages.length - 1],
-    )}`;
-  process.stderr.write(`::warning${header}::${warningMessage}\n`);
+    );
+  } catch {
+    finalMessageInfo = `(unable to parse request body: ${options.body?.slice(0, 200) ?? "empty"})`;
+  }
+
+  const errorMessage =
+    `No cached response found for ${options.requestOptions.method} ${options.requestOptions.path}. ` +
+    `Final message: ${finalMessageInfo}`;
+  process.stderr.write(`::error${header}::${errorMessage}\n`);
+  options.onError(new Error(errorMessage));
 }
 
 async function findSavedChatCompletionResponse(
diff --git a/test/snapshots/ask_user/should_handle_freeform_user_input_response.yaml b/test/snapshots/ask_user/should_handle_freeform_user_input_response.yaml
new file mode 100644
index 00000000..437cc2dd
--- /dev/null
+++ b/test/snapshots/ask_user/should_handle_freeform_user_input_response.yaml
@@ -0,0 +1,21 @@
+models:
+  - claude-sonnet-4.5
+conversations:
+  - messages:
+      - role: system
+        content: ${system}
+      - role: user
+        content: Ask me a question using ask_user and then include my answer in your response. The question should be 'What is
+          your favorite color?'
+      - role: assistant
+        tool_calls:
+          - id: toolcall_0
+            type: function
+            function:
+              name: ask_user
+              arguments: '{"question":"What is your favorite color?"}'
+      - role: tool
+        tool_call_id: toolcall_0
+        content: "User response (freeform): This is my custom freeform answer that was not in the choices"
+      - role: assistant
+        content: 'You answered: "This is my custom freeform answer that was not in the choices"'
diff --git a/test/snapshots/ask_user/should_invoke_user_input_handler_when_model_uses_ask_user_tool.yaml b/test/snapshots/ask_user/should_invoke_user_input_handler_when_model_uses_ask_user_tool.yaml
new file mode 100644
index 00000000..beb7a584
--- /dev/null
+++ b/test/snapshots/ask_user/should_invoke_user_input_handler_when_model_uses_ask_user_tool.yaml
@@ -0,0 +1,21 @@
+models:
+  - claude-sonnet-4.5
+conversations:
+  - messages:
+      - role: system
+        content: ${system}
+      - role: user
+        content: Ask me to choose between 'Option A' and 'Option B' using the ask_user tool. Wait for my response before
+          continuing.
+      - role: assistant
+        tool_calls:
+          - id: toolcall_0
+            type: function
+            function:
+              name: ask_user
+              arguments: '{"question":"Please choose between the following options:","choices":["Option A","Option B"]}'
+      - role: tool
+        tool_call_id: toolcall_0
+        content: "User selected: Option A"
+      - role: assistant
+        content: You selected **Option A**. How would you like to proceed?
diff --git a/test/snapshots/ask_user/should_receive_choices_in_user_input_request.yaml b/test/snapshots/ask_user/should_receive_choices_in_user_input_request.yaml
new file mode 100644
index 00000000..70537806
--- /dev/null
+++ b/test/snapshots/ask_user/should_receive_choices_in_user_input_request.yaml
@@ -0,0 +1,21 @@
+models:
+  - claude-sonnet-4.5
+conversations:
+  - messages:
+      - role: system
+        content: ${system}
+      - role: user
+        content: "Use the ask_user tool to ask me to pick between exactly two options: 'Red' and 'Blue'. These should be
+          provided as choices. Wait for my answer."
+      - role: assistant
+        tool_calls:
+          - id: toolcall_0
+            type: function
+            function:
+              name: ask_user
+              arguments: '{"question":"Please pick one of the following options:","choices":["Red","Blue"],"allow_freeform":false}'
+      - role: tool
+        tool_call_id: toolcall_0
+        content: "User selected: Red"
+      - role: assistant
+        content: You selected **Red**.
diff --git a/test/snapshots/mcp_and_agents/should_accept_both_mcp_servers_and_custom_agents.yaml b/test/snapshots/mcp_and_agents/should_accept_both_mcp_servers_and_custom_agents.yaml
new file mode 100644
index 00000000..60d1eade
--- /dev/null
+++ b/test/snapshots/mcp_and_agents/should_accept_both_mcp_servers_and_custom_agents.yaml
@@ -0,0 +1,10 @@
+models:
+  - claude-sonnet-4.5
+conversations:
+  - messages:
+      - role: system
+        content: ${system}
+      - role: user
+        content: What is 7+7?
+      - role: assistant
+        content: 7 + 7 = 14
diff --git a/test/snapshots/mcp_and_agents/should_accept_custom_agent_configuration_on_session_create.yaml b/test/snapshots/mcp_and_agents/should_accept_custom_agent_configuration_on_session_create.yaml
new file mode 100644
index 00000000..56da15ba
--- /dev/null
+++ b/test/snapshots/mcp_and_agents/should_accept_custom_agent_configuration_on_session_create.yaml
@@ -0,0 +1,10 @@
+models:
+  - claude-sonnet-4.5
+conversations:
+  - messages:
+      - role: system
+        content: ${system}
+      - role: user
+        content: What is 5+5?
+      - role: assistant
+        content: 5 + 5 = 10
diff --git a/test/snapshots/mcp_and_agents/should_accept_custom_agent_configuration_on_session_resume.yaml b/test/snapshots/mcp_and_agents/should_accept_custom_agent_configuration_on_session_resume.yaml
new file mode 100644
index 00000000..9703495c
--- /dev/null
+++ b/test/snapshots/mcp_and_agents/should_accept_custom_agent_configuration_on_session_resume.yaml
@@ -0,0 +1,14 @@
+models:
+  - claude-sonnet-4.5
+conversations:
+  - messages:
+      - role: system
+        content: ${system}
+      - role: user
+        content: What is 1+1?
+      - role: assistant
+        content: 1+1 equals 2.
+      - role: user
+        content: What is 6+6?
+      - role: assistant
+        content: 6+6 equals 12.
diff --git a/test/snapshots/mcp_and_agents/should_accept_mcp_server_configuration_on_session_create.yaml b/test/snapshots/mcp_and_agents/should_accept_mcp_server_configuration_on_session_create.yaml
new file mode 100644
index 00000000..9fe2fcd0
--- /dev/null
+++ b/test/snapshots/mcp_and_agents/should_accept_mcp_server_configuration_on_session_create.yaml
@@ -0,0 +1,10 @@
+models:
+  - claude-sonnet-4.5
+conversations:
+  - messages:
+      - role: system
+        content: ${system}
+      - role: user
+        content: What is 2+2?
+      - role: assistant
+        content: 2 + 2 = 4
diff --git a/test/snapshots/mcp_and_agents/should_accept_mcp_server_configuration_on_session_resume.yaml b/test/snapshots/mcp_and_agents/should_accept_mcp_server_configuration_on_session_resume.yaml
new file mode 100644
index 00000000..82c9917c
--- /dev/null
+++ b/test/snapshots/mcp_and_agents/should_accept_mcp_server_configuration_on_session_resume.yaml
@@ -0,0 +1,14 @@
+models:
+  - claude-sonnet-4.5
+conversations:
+  - messages:
+      - role: system
+        content: ${system}
+      - role: user
+        content: What is 1+1?
+      - role: assistant
+        content: 1 + 1 = 2
+      - role: user
+        content: What is 3+3?
+      - role: assistant
+        content: 3 + 3 = 6
diff --git a/test/snapshots/permissions/should_receive_toolcallid_in_permission_requests.yaml b/test/snapshots/permissions/should_receive_toolcallid_in_permission_requests.yaml
index 63fde597..b529a01e 100644
--- a/test/snapshots/permissions/should_receive_toolcallid_in_permission_requests.yaml
+++ b/test/snapshots/permissions/should_receive_toolcallid_in_permission_requests.yaml
@@ -46,4 +46,4 @@ conversations:
           test
           <exited with exit code 0>
       - role: assistant
-        content: The command executed successfully and output "test".
+        content: Command executed successfully, output is "test".
diff --git a/test/snapshots/session/send_returns_immediately_while_events_stream_in_background.yaml b/test/snapshots/session/send_returns_immediately_while_events_stream_in_background.yaml
index 8deef905..6da08758 100644
--- a/test/snapshots/session/send_returns_immediately_while_events_stream_in_background.yaml
+++ b/test/snapshots/session/send_returns_immediately_while_events_stream_in_background.yaml
@@ -12,14 +12,15 @@ conversations:
             type: function
             function:
               name: report_intent
-              arguments: '{"intent":"Running sleep command"}'
+              arguments: '{"intent":"Running command"}'
       - role: assistant
         tool_calls:
           - id: toolcall_1
             type: function
             function:
               name: ${shell}
-              arguments: '{"command":"sleep 2 && echo done","description":"Run sleep 2 and echo done","initial_wait":5}'
+              arguments: '{"command":"sleep 2 && echo done","description":"Run sleep and echo
+                command","initial_wait":5,"mode":"sync"}'
   - messages:
       - role: system
         content: ${system}
@@ -31,12 +32,13 @@ conversations:
             type: function
             function:
               name: report_intent
-              arguments: '{"intent":"Running sleep command"}'
+              arguments: '{"intent":"Running command"}'
           - id: toolcall_1
             type: function
             function:
               name: ${shell}
-              arguments: '{"command":"sleep 2 && echo done","description":"Run sleep 2 and echo done","initial_wait":5}'
+              arguments: '{"command":"sleep 2 && echo done","description":"Run sleep and echo
+                command","initial_wait":5,"mode":"sync"}'
       - role: tool
         tool_call_id: toolcall_0
         content: Intent logged
@@ -46,4 +48,4 @@ conversations:
           done
           <exited with exit code 0>
       - role: assistant
-        content: The command completed successfully after a 2-second sleep and output "done".
+        content: Command completed successfully. The output is "done".
diff --git a/test/snapshots/session/sendandwait_throws_on_timeout.yaml b/test/snapshots/session/sendandwait_throws_on_timeout.yaml
new file mode 100644
index 00000000..0e019bda
--- /dev/null
+++ b/test/snapshots/session/sendandwait_throws_on_timeout.yaml
@@ -0,0 +1,8 @@
+models:
+  - claude-sonnet-4.5
+conversations:
+  - messages:
+      - role: system
+        content: ${system}
+      - role: user
+        content: Run 'sleep 2 && echo done'