Overclock-Validator · 7layermagik · Jan 13, 2026 · Jan 13, 2026 · Jan 13, 2026 · Jan 13, 2026
diff --git a/cmd/mithril/node/node.go b/cmd/mithril/node/node.go
@@ -193,6 +193,7 @@ func init() {
 	Run.Flags().BoolVar(&sbpf.UsePool, "use-pool", true, "Disable to allocate fresh slices")
 
 	// [tuning.pprof] section flags
+	Run.Flags().Int64Var(&pprofPort, "pprof-port", -1, "Port to serve HTTP pprof endpoint")
 	Run.Flags().StringVar(&cpuprofPath, "cpu-profile-path", "", "Filename to write CPU profile")
 
 	// [debug] section flags
@@ -855,7 +856,15 @@ func runVerifyRange(c *cobra.Command, args []string) {
 		klog.Fatalf("end slot cannot be lower than start slot")
 	}
 	mlog.Log.Infof("will replay startSlot=%d endSlot=%d", startSlot, endSlot)
-	accountsDb.InitCaches()
+	accountsDb.InitCaches(
+		config.GetInt("tuning.cache.vote_acct_lru"),
+		config.GetInt("tuning.cache.stake_acct_lru"),
+		config.GetInt("tuning.cache.small_acct_lru"),
+		config.GetInt("tuning.cache.medium_acct_lru"),
+		config.GetInt("tuning.cache.huge_acct_lru"),
+		config.GetInt("tuning.cache.program_lru"),
+		config.GetInt("tuning.cache.seen_once_filter_size"),
+	)
 
 	metricsWriter, metricsWriterCleanup, err := createBufWriter(metricsPath)
 	if err != nil {
@@ -1075,6 +1084,11 @@ func runLive(c *cobra.Command, args []string) {
 	// Now start the metrics server (after banner so errors don't appear first)
 	statsd.StartMetricsServer()
 
+	// Start pprof HTTP server if configured
+	if pprofPort != -1 {
+		startPprofHandlers(int(pprofPort))
+	}
+
 	// Determine if using Lightbringer based on block source
 	// NOTE: Lightbringer mode is TEMPORARILY DISABLED. The background block downloader that
 	// wrote Lightbringer blocks to disk was removed due to reliability issues (panics, race conditions).
@@ -1160,7 +1174,7 @@ func runLive(c *cobra.Command, args []string) {
 
 	// Handle explicit --snapshot flag (bypasses all auto-discovery, does NOT delete snapshot files)
 	if snapshotArchivePath != "" {
-		mlog.Log.Infof("Using snapshot file: %s", snapshotArchivePath)
+		mlog.Log.Infof("Using full snapshot: %s", snapshotArchivePath)
 
 		// Parse full snapshot slot from filename for validation
 		fullSnapshotSlot := parseSlotFromSnapshotName(filepath.Base(snapshotArchivePath))
@@ -1610,7 +1624,15 @@ postBootstrap:
 	}
 
 	liveEndSlot := uint64(math.MaxUint64)
-	accountsDb.InitCaches()
+	accountsDb.InitCaches(
+		config.GetInt("tuning.cache.vote_acct_lru"),
+		config.GetInt("tuning.cache.stake_acct_lru"),
+		config.GetInt("tuning.cache.small_acct_lru"),
+		config.GetInt("tuning.cache.medium_acct_lru"),
+		config.GetInt("tuning.cache.huge_acct_lru"),
+		config.GetInt("tuning.cache.program_lru"),
+		config.GetInt("tuning.cache.seen_once_filter_size"),
+	)
 
 	metricsWriter, metricsWriterCleanup, err := createBufWriter(metricsPath)
 	if err != nil {

diff --git a/cmd/mithril/node/pprof.go b/cmd/mithril/node/pprof.go
@@ -3,6 +3,7 @@ package node
 import (
 	"fmt"
 	"net/http"
+	_ "net/http/pprof" // registers /debug/pprof/* handlers
 	"runtime"
 	"strconv"
 	"time"

diff --git a/config.example.toml b/config.example.toml
@@ -232,13 +232,13 @@ name = "mithril"
     port = 8899
 
 # ============================================================================
-# [tuning] - Performance Tuning & Profiling
+# [development] - Performance Tuning & Profiling
 # ============================================================================
 #
 # Advanced settings for optimizing Mithril's performance.
 # The defaults work well for most deployments.
 
-[tuning]
+[development]
     # Zstd decoder concurrency (defaults to NumCPU)
     # zstd_decoder_concurrency = 16
 
@@ -254,15 +254,57 @@ name = "mithril"
     # Enable/disable pool allocator for slices
     use_pool = true
 
-    # [tuning.pprof] - CPU/Memory Profiling
-    [tuning.pprof]
+    # [development.pprof] - CPU/Memory Profiling
+    [development.pprof]
         # Port to serve HTTP pprof endpoint (-1 to disable)
         # Access at http://localhost:PORT/debug/pprof/
         # port = 6060
 
         # Filename to write CPU profile (for offline analysis)
         # cpu_profile_path = "/tmp/cpuprof.pprof"
 
+    # [development.cache] - AccountsDB LRU Cache Sizes
+    #
+    # These control the LRU caches for fast account data reads during replay.
+    # Values are NUMBER OF ENTRIES, not bytes.
+    #
+    # NOTE: These are DIFFERENT from the global vote/stake caches used for
+    # leader schedule building. Those are unbounded maps that store vote STATE
+    # (voting history, credits) and stake DELEGATIONS. These LRU caches store
+    # full ACCOUNT data for frequently-accessed accounts.
+    #
+    # Larger caches = fewer disk reads, but more memory usage.
+    # Memory per entry scales with account data size: ~200-1000 bytes for small accounts, more for medium/huge ones.
+    [development.cache]
+        # Vote account data cache - number of entries (frequently accessed during replay)
+        vote_acct_lru = 5000
+
+        # Stake account data cache - number of entries (separated to avoid evicting
+        # hot accounts during epoch rewards when ~1.25M stake accounts are touched once each)
+        stake_acct_lru = 2000
+
+        # Small account data cache - accounts ≤512 bytes (token accounts, etc.)
+        small_acct_lru = 50000
+
+        # Medium account data cache - accounts >512 bytes and ≤64KB
+        medium_acct_lru = 20000
+
+        # Huge account data cache - accounts >64KB (mostly programs)
+        huge_acct_lru = 500
+
+        # Compiled BPF program cache - number of entries
+        program_lru = 5000
+
+        # Admit-on-second-hit LRU filter for common accounts (small/medium/huge)
+        # Only caches accounts seen twice within the filter window, filtering one-shot reads.
+        #   0 = disabled (cache everything immediately, like traditional LRU)
+        #   >0 = enable filtering with LRU of this capacity
+        # Recommended: 50000 (roughly 0.5-1x of small+medium+huge total)
+        # Monitor SeenOnceAdmitted/SeenOnceFiltered ratio to tune:
+        #   <5-10% admitted = too strict (filter too small)
+        #   >40-50% admitted = too lenient (filter too large or most accesses are hot)
+        seen_once_filter_size = 0
+
 # ============================================================================
 # [debug] - Debug Logging
 # ============================================================================

diff --git a/docs/TODO.md b/docs/TODO.md
@@ -0,0 +1,88 @@
+# TODO / Known Issues
+
+- Identified on branch `perf/reward-distribution-optimizations` at commit `3b2ad67`
+- dev HEAD at time of identification: `a25b2e3`
+- Date: 2026-01-13
+
+---
+
+## Failing Tests
+
+### 1. Address Lookup Table Tests - `InstrErrUnsupportedProgramId`
+
+- **File:** `pkg/sealevel/address_lookup_table_test.go`
+- **Test:** `TestExecute_AddrLookupTable_Program_Test_Create_Lookup_Table_Idempotent` (and likely all other ALT tests)
+
+**Root Cause:** `AddressLookupTableAddr` and `StakeProgramAddr` were accidentally removed from the `resolveNativeProgramById` switch in `pkg/sealevel/native_programs_common.go`.
+
+| Program | Removed In | Commit Date | Commit Message |
+|---------|------------|-------------|----------------|
+| `AddressLookupTableAddr` | `d47c16b` | May 16, 2025 | "many optimisations and changes" |
+| `StakeProgramAddr` | `e890f9e` | Jul 26, 2025 | "snapshot download, stake program migration, refactoring" |
+
+**Fix:** Add these cases back to the switch in `resolveNativeProgramById`:
+```go
+case a.StakeProgramAddr:
+    return StakeProgramExecute, a.StakeProgramAddrStr, nil
+case a.AddressLookupTableAddr:
+    return AddressLookupTableExecute, a.AddressLookupTableProgramAddrStr, nil
+```
+
+---
+
+### 2. Bank Hash Test - Nil Pointer Dereference
+
+- **File:** `pkg/replay/hash_test.go`
+- **Test:** `Test_Compute_Bank_Hash`
+
+**Error:**
+```
+panic: runtime error: invalid memory address or nil pointer dereference
+pkg/replay/hash.go:227 - shouldIncludeEah(0x0, 0x0)
+```
+
+**Root Cause:** Test passes `nil` for the first argument to `shouldIncludeEah`, which dereferences it without a nil check.
+
+**Fix:** Either add nil check in `shouldIncludeEah` or fix the test to pass valid arguments.
+
+---
+
+## Agave/Firedancer Parity Issues
+
+### 3. Missing "Burned Rewards" Semantics in Reward Distribution
+
+**File:** `pkg/rewards/rewards.go` (lines 180-230)
+
+**Problem:** Mithril does not implement "burn" semantics for per-account failures during partitioned reward distribution. This diverges from both Agave and Firedancer.
+
+**Current Mithril behavior:**
+- `GetAccount` error → panic (aborts replay)
+- `UnmarshalStakeState` error → silent skip (reward lost, not counted)
+- `MarshalStakeStakeInto` error → panic (aborts replay)
+- Lamport overflow → panic (aborts replay)
+
+**Agave behavior** (`distribution.rs:260`):
+- `build_updated_stake_reward` returns `DistributionError::UnableToSetState` or `AccountNotFound`
+- Caller logs error and adds to `lamports_burned`
+- Continues processing remaining accounts
+
+**Firedancer behavior** (`fd_rewards.c:958`):
+- `distribute_epoch_reward_to_stake_acc` returns non-zero on decode/non-stake/etc.
+- Caller increments `lamports_burned` and continues
+
+**Failure scenarios that should burn (not panic):**
+- Account missing / not found
+- Stake state decode fails (including short/invalid data)
+- Account isn't a stake account
+- Lamport add overflows
+- `set_state`/encode fails (e.g., data too small)
+
+**Fix required:**
+1. Add `lamports_burned` tracking to reward distribution
+2. Change panics to log + burn + continue
+3. `epochRewards.Distribute()` should receive `distributedLamports` (successful) separately from burned amount
+4. Ensure `SysvarEpochRewards.DistributedRewards` advances correctly (may need to include burned in total)
+
+**Note:** The current silent skip on `UnmarshalStakeState` error reduces `distributedLamports` but doesn't track the skipped reward as burned, which may cause `SysvarEpochRewards` to diverge from Agave/FD.
+
+---