diff --git a/.claude/plans/variant-switch-br-table.md b/.claude/plans/variant-switch-br-table.md new file mode 100644 index 00000000000..b229378fcfb --- /dev/null +++ b/.claude/plans/variant-switch-br-table.md @@ -0,0 +1,566 @@ +# Plan: Masked `br_table` Dispatch for Variant Switches in `moc` + +## Motivation + +Motoko variant tags are 32-bit hashes of string labels. A `switch` on a +variant with `n` arms currently compiles to a linear chain of `n` hash +comparisons (`orsPatternFailure`). The existing special cases are: + +- **1-arm**: skip tag check entirely (`single_case`) +- **2-arm**: skip the second comparison (`simplify_cases`) +- **n-arm**: linear chain — O(n) comparisons + +This plan adds an O(1) dispatch for the n-arm case using Wasm `br_table` +and a compile-time-chosen bitmask. + +## Core Idea + +Given variant hashes `h₁, h₂, ..., hₙ`, find a mask `M` at compile time +such that `hᵢ & M` are all distinct. Also compute `S = ctz(M)` (count of +trailing zero bits in `M`) at compile time. Then emit at runtime: + +```wasm +local.get $tag +i32.const M ;; compile-time constant +i32.and ;; runtime: mask the tag +i32.const S ;; compile-time constant: S = ctz(M) — both lines +i32.shr_u ;; omitted entirely when S = 0 (would be a nop) +br_table $tbl[0] $tbl[1] ... $default +``` + +The right-shift eliminates the trailing zero bits of `M`, reducing the +effective table size from `max(hᵢ & M) + 1` to +`(max(hᵢ & M) >> S) + 1` — potentially much smaller. + +## Finding M and S (OCaml, all at compile time) + +**Algorithm:** iterate integers `n` from 1 upward, filtering for +`popcount(n) = ceil(log₂(arms))` (the minimum bits needed to distinguish +`arms` values — exactly `k` bits can represent 2^k distinct indices). +For each candidate mask, check if masking all hashes is injective. +Return the first that passes — it is automatically compact because small +integers concentrate their set bits in low positions, keeping `max(hᵢ & M)` +small and thus the table size small after the `>> S` shift. + +Note: `floor(log₂(n)) + 1` equals `ceil(log₂(n))` only when `n` is *not* +a power of 2; for powers of 2 it overshoots by 1. Use `ceil` directly. + +```ocaml +(* All computation below happens during Wasm code generation, not at runtime *) + +let popcount32 m = (* count set bits in int32 *) ... +let ctz32 m = (* count trailing zeros; 32 if m = 0l *) ... + +(* ceil(log₂(n)) — minimum bits to index n distinct values *) +let bits_needed n = + let rec f k = if 1 lsl k >= n then k else f (k + 1) in + f 1 + +let is_injective mask hashes = + let masked = List.map (Int32.logand mask) hashes in + List.length masked = List.length (List.sort_uniq compare masked) + +let find_mask arms hashes threshold = + let required_bits = bits_needed arms in + (* Iterate n = 1, 2, 3, ... filtering for popcount = required_bits *) + let rec loop n = + if n = 0l then None (* wrapped around — give up, use linear *) + else if popcount32 n = required_bits && is_injective n hashes then + let s = ctz32 n in + let tbl_size = (* max((hᵢ & n) >> s) + 1, computed at compile time *) ... in + if tbl_size <= threshold then Some (n, s, tbl_size) + else loop (Int32.add n 1l) (* mask valid but table too big: keep trying *) + else loop (Int32.add n 1l) + in + loop 1l +``` + +If no mask with `required_bits` bits fits in the threshold, try +`required_bits + 1`, etc., up to a small maximum (e.g. 8 bits). + +**Threshold:** e.g. `max(64, 4 * n)`. + +## Wasm Block Structure (runtime) + +`br_table` with index `i` exits the `i`-th enclosing block. 

```wasm
block $exit
  block $default
    block $arm_{n-1}
      ...
      block $arm_0
        local.get $tag
        i32.const M    ;; compile-time constant
        i32.and        ;; runtime
        ;; only emit next two instructions when S > 0:
        i32.const S    ;; compile-time constant, S = ctz(M)
        i32.shr_u      ;; runtime — omitted entirely when S = 0
        br_table $tbl[0] .. $tbl[tbl_size-1] $default
      end ;; $arm_0
      ;; arm 0 body
      br $exit
      ...
    end ;; $arm_{n-1}
    ;; arm n-1 body
    br $exit
  end ;; $default
  unreachable ;; type-safe: dead code
end ;; $exit
```

`$tbl[j]` = `$arm_k` where `(hₖ & M) >> S = j`, else `$default`.

## Break-even Analysis

### Dynamic instruction count

Each arm in the current linear chain costs ~6 instructions:
`local.get` + 2 heap loads (forwarding ptr + tag field) + `i32.const` + `i32.eq` + `br_if`.
The chain stops at the first match, so:

| Path | Instructions executed |
|------|-----------------------|
| Linear (worst case, last arm) | `6n` |
| Linear (average, uniform input) | `3n` |
| br_table, S = 0 | **6** (get + 2 loads + const M + and + br_table) |
| br_table, S > 0 | **8** (+ const S + shr_u) |

**Worst-case break-even:** n ≥ 2 for both variants (6 < 12, 8 < 12).
**Average break-even:** n ≥ 3 (6 < 9, 8 < 9).

Since n = 1 and n = 2 are already handled by `single_case` / `simplify_cases`,
the `br_table` path is a strict win for every case it applies to (n ≥ 3).

### Static code size

Approximating ~4 bytes/instruction and ~4 bytes/br_table entry:

| Path | Bytes |
|------|-------|
| Linear | `~26n` (6 instrs/arm + block overhead) |
| br_table, S = 0 | `~20 + 4 × table_size + 2n` |
| br_table, S > 0 | `~28 + 4 × table_size + 2n` |

Code-size break-even (S = 0): `table_size ≤ 6(n − 1)` → e.g. n=3 allows table_size ≤ 12.
Code-size break-even (S > 0): `table_size ≤ 6n − 8` → e.g. n=3 allows table_size ≤ 10.

The `4n` term of the threshold `max(64, 4n)` stays within these limits for
n ≥ 4, but the flat 64 floor exceeds them until n = 12, so a pathological
hash set may trade some code size for the O(1) dispatch. In practice the
minimal-popcount-first search settles on tables far below the cap.

### Empirical measurement (TODO)

Add a `bench` test that calls a large variant switch (e.g. 8-arm, 16-arm)
in a tight loop, comparing the old linear output (forced via flag or saved
`.wat`) against the optimised one. The existing `bench` package in the
test suite can measure Wasm instruction counts or wall-clock cycles.

Note: Wasm JIT compilers (wasmtime, V8) typically lower `br_table` to a
hardware jump table, giving an additional constant-factor speedup over
what static instruction counts suggest.

## Where to Apply the Optimisation

Three candidate insertion points, evolved over the life of the PR:

**Option A — IR level** (`SwitchE` with `TagP` arms, in `compile_classical.ml`)
— the shipped V1. Easy to reach labels, hashes, and exhaustiveness,
but only sees one IR shape: a flat list of `TagP` arms with distinct bodies.

**Option B — Wasm peephole** (scan `i32.const hash / i32.eq / br_if`
chains and rewrite). Rejected: labels must be re-decoded from operands,
exhaustiveness is not locally known, `Wasm.AST` is functional (replacement
is unnatural), and no existing peephole infrastructure to attach to.

**Option C — pattern-code EDSL with an algebraic-effect strategy query**
— the V2 proposal. `compile_enhanced.ml:10635` composes pattern-matching
via `patternCode` values combined with `(^^^)`. Both flat variant arms
and or-pattern arms end up producing the *same* kind of "compare tag
hash H, branch to body B" fragment at this level — the IR-shape
distinction that Option A is blind to has already been elaborated away.

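To make Option C concrete, here is a schematic of the leaf fragment both
shapes reduce to. Illustrative only: `get_variant_tag_code`, `compile_eq_const`
and `G.if0` are stand-in names, not the real signatures; the actual
`patternCode` encoding is quoted under the Recognizer heading below.

```ocaml
(* Sketch. Whether it came from a flat arm `case (#tue) e` or from one
   leg of `case (#tue or #thu) e`, the EDSL bottoms out in the same
   "compare tag hash, fall through on mismatch" fragment: *)
let tag_leaf (hash : int32) (body : G.t) : patternCode =
  CanFail (fun fail_code ->
    get_variant_tag_code ^^        (* runtime: load the scrutinee's tag *)
    compile_eq_const hash ^^       (* compare against the constant hash *)
    G.if0 body fail_code)          (* hit: run body; miss: next leg     *)
```
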
+ +### Why V1 is not enough + +`isWeekday` (7 flat arms: `#mon → true; #tue → true; ...; #sun → false`) +optimises to `br_table`. The semantically-equal `isWeekdayOr` +(or-pattern: `case (#mon | #tue | ... | #fri) true; case _ false`) +does not — at IR the or-pattern is a *single* arm with a disjunctive +pattern, so `SwitchE` sees 2 arms and the 4-arm threshold rejects it. +Both should dispatch identically. + +### Architecture: Handler / Recognizer split + +Two roles, connected by an algebraic-effect protocol (OCaml 5.3 effects, +same machinery as ConstTrack Phase 3). Neither role is variant-specific +— the mechanism is a general matching-EDSL facility for any dispatch +decision. Variant switches are just its first application; AND-patterns +and literal-match chains are future applications of the same protocol. + +**Handler** — installed at IR dispatch nodes (`SwitchE` and friends). +Sees the IR node and its type. Knows what kind of decision is being +compiled (variant tag match, integer literal match, record projection, +tuple component match, …), so it knows which strategies *could* apply +and what cost model is meaningful for this decision shape. When the +recognizer asks, the handler computes and returns the chosen strategy. + +**Recognizer** — lives inside the matching EDSL — *the procedural +combinator calls* (`fill_pat`, `compile_pat_local`, `(^^^)`, +`orElse`, `orsPatternFailure`, …) that together emit Wasm block +structures for pattern matching. `patternCode` itself is opaque +(`CannotFail of G.t | CanFail of (G.t -> G.t)`) — no walkable AST +survives composition — so the recognizer must observe decisions +*during* emission, not after. At each point where a tag-hash +comparison (or, later, a literal comparison, component projection, +…) is about to be emitted, the combinator `perform`s an effect +— `Match_decision { token; body_compiler; scrutinee_repr }` — +to the enclosing handler. The handler collects incrementally and, +when the arm set is complete, commits on a strategy and emits. + +The protocol is deliberately generic: + +- `token` is opaque to the recognizer — for variants it's a tag + hash, for integer literals it's an immediate, for constructor + matching it's a nominal ID. The handler interprets it. +- `body_compiler` is a thunk the handler can invoke (or not) to + emit the arm body's Wasm. Giving the handler *control over + emission* — rather than just asking it for a strategy to feed + back in — is what unlocks AND-patterns later (the handler can + return `No_op` and suppress emission entirely when a component + is already known from an outer context). +- `scrutinee_repr` says how to obtain the discriminating value + (already loaded on the stack, loadable from a known local, computable + by a callback). The handler chooses strategies compatible with that + shape. + +### Concrete emission points in the EDSL + +In `compile_enhanced.ml`, the `perform`-sites are: + +- `fill_pat env ae (TagP (l, p))` — surfaces `Match_decision` with + `token = hash_variant_label l`, `body_compiler = compile-tail-of-arm`. +- `fill_pat env ae (AltP (p1, p2))` when both legs are `TagP` or + nested `AltP` chains bottoming out in `TagP` — surfaces one + `Match_decision` per leaf, sharing the same `body_compiler` + (this is what makes or-patterns auto-fold). +- Later, literal-pattern arms (`LitP`) would surface `Match_decision` + with `token = immediate`. + +No changes to `(^^^)`, `orElse`, or `orsPatternFailure` — they stay +pure G.t manipulation. 
The effects are attached at the *leaf* +combinators that know what kind of value is being discriminated. + +### Strategy for V2 launch + +Gosper-based `MaskShift` only — same cost model as V1. The multi- +strategy batched search (ModPrime, RotLow, …) remains listed below +as future work; the protocol is forward-compatible with them since +strategies are plan-values the handler returns, but the V2 milestone +is purely "make V1's `br_table` dispatch work through the effect +protocol, so or-patterns get it for free". + +### Success criterion + +An or-pattern switch must compile to byte-identical Wasm as its +hand-expanded flat-arm equivalent. Concretely: `isWeekdayOr` (case +`(#mon | #tue | ... | #fri) true; case _ false`) and `isWeekday` +(7 flat arms) must disassemble to the same `i32.and; [shr_u]; +br_table` dispatch, with identical mask, shift, table, and arm +blocks (modulo block-label numbering). A FileCheck test pinning +this equivalence should live in `test/run/variant_switch.mo` (or +a sibling) and is the acceptance gate for V2. + +### What this buys + +- **or-patterns auto-fold.** Distinct arms with identical bodies + naturally perform the same payload and the handler sees `k` + equivalence classes for free. The "Same-body arm merging" section + below becomes automatic under V2. +- **Generic.** The handler/recognizer split applies to any + multi-way decision. Future AND-patterns — where matching a + product pattern may want to *skip* components already tested by an + outer match — fit the same protocol: the outer handler installs + "component `k` is known to be `v`" context, the inner recognizer + asks, and the handler returns `No_op` instead of a dispatch plan. +- **Testable in isolation.** The recognizer can be exercised with a + synthetic handler that records the effect trace; the handler can + be exercised on hand-built `token_set` inputs without a Wasm + backend attached. +- **Context-sensitive overrides.** Outer handlers (debug flag, + size-budget pass, `--preserve-switch-shape` for disassembly) can + intercept and force a specific plan without touching call sites. + +### Migration path + +V1 (IR-level, shipped) stays in place for `compile_classical.ml`. V2 +lands first in `compile_enhanced.ml` where the matching EDSL lives, +then — if the architecture proves out — backports to the classical +backend. The `Match_decision` effect surface is narrow enough that +future handlers at different entry points (AND-pattern compilation, +literal-match chains, an IR pass that pre-folds or-patterns) can +slot in without changes at the recognizer side. + +## Implementation Steps + +### Step 1 — Mask-finding utility (compile time) + +Add near the `Variant` module in `compile_classical.ml`: + +```ocaml +(* Returns Some (mask, shift, dispatch_table) or None if too large. + dispatch_table.(j) = Some arm_index | None (hole → default). *) +val find_variant_dispatch : int32 list -> int -> + (int32 * int * int option array) option +``` + +### Step 2 — New dispatch path in `SwitchE` + +Extract `(TagP (lᵢ, _), body_i)` for each case, compute hashes via +`hash_variant_label env lᵢ`, call `find_variant_dispatch`. If `Some`: + +1. Emit `get_tag ^^ compile_unboxed_const M ^^ G.i (Binary (I32 And))` +2. If S > 0: emit `compile_unboxed_const (Int32.of_int S) ^^ G.i (Binary (I32 ShrU))` +3. Build `BrTable` target list from dispatch table +4. Wrap in nested blocks, emit arm bodies with `Br $exit` + +If `None`, fall through to `orsPatternFailure`. 
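The nested-block wrapping in item 4 collapses to a single fold; this is
condensed from the implementation further down (arm bodies already end in
their `Br $exit`, and `G.block0` / `^^` are the backend's existing
combinators):

```ocaml
(* Arm k sits under exactly k enclosing blocks, so br_table index k
   exits straight into arm k's body; the extra outermost block is the
   default target for table holes and traps immediately. *)
let with_arms =
  List.fold_left (fun acc body_code -> G.block0 acc ^^ body_code)
    dispatch arm_body_codes in
G.block0 with_arms ^^ G.i Unreachable
```
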

### Step 3 — Payload extraction

Call `Variant.project env` at the start of each arm body (same as current
path) to load the variant payload after dispatch.

### Step 4 — `compile_enhanced.ml` *(TODO)*

Apply the same optimisation to the EOP backend. Check for a parallel
`SwitchE` handler in `src/codegen/compile_enhanced.ml` and port the helpers
(`find_variant_mask` etc.) and the new dispatch path there.

Note: the bench tests (`test/bench/`) run under EOP by default — once this
step is done, those benchmarks will reflect the real instruction savings.

### Step 5 — Tests

- 3-arm, 5-arm, 10-arm, 20-arm variant switches
- Cases where S > 0 (mask has trailing zeros)
- Edge case: threshold exceeded → confirm fallback to linear chain
- Inspect `.wat` output to verify `br_table` + shift are emitted correctly

## Key Files

| File | Role |
|------|------|
| `src/codegen/compile_classical.ml` | Main: `SwitchE`, `simplify_cases`, `Variant` module |
| `src/codegen/compile_enhanced.ml` | Parallel backend, may need same change |
| `src/wasm-exts/ast.ml` | `BrTable` already defined (line 102) |
| `test/run/` | New test cases |

## Open Questions / Risks

1. **`FakeMultiVal.block_` interaction.** The multi-value block wrapper
   tracks nesting depth. The new nested arm blocks must be introduced
   *inside* `FakeMultiVal.block_` so depth accounting stays correct.

2. **Hash collisions between labels.** Essentially impossible (32-bit hash
   over distinct strings within one type), but the algorithm degrades
   gracefully — it simply needs more bits in `M`. Ultimate fallback:
   `M = ~0l`, `S = 0`, table size up to 2^31 (hashes are 31-bit; see
   item 5) → threshold exceeded → linear.

3. **Gosper iteration cutoff.** Without a bound, `iter_masks_with_popcount`
   may exhaust all C(32,k) candidates (e.g. C(32,4) = 35,960) before
   returning `None`. A cutoff of 2^16 iterations per popcount level is
   applied in both backends to keep compile time bounded.

4. **`None` fallback correctness — implemented.** The `SwitchE` br_table
   branch builds arm codes using `known_tag_pat` (outer tag check stripped).
   A `None` return from `find_variant_mask` (whether from the cutoff, no
   valid mask, or duplicate labels) must fall through to the regular linear
   `SwitchE` handler — not call `orsPatternFailure` with tag-check-free arms
   (which would let any arm match any outer tag).

   **Fix:** `find_variant_mask` is called in the `when` guard itself. If it
   returns `None`, the guard fails and OCaml's pattern match falls through to
   the ordinary `SwitchE` handler with full patterns. The `None` match arm
   in the body is eliminated entirely. The distinct-labels workaround is
   also removed — it is fully subsumed: duplicate labels cause `is_injective`
   to fail for every mask, so `find_variant_mask` returns `None` and the
   guard fails safely.

5. **31-bit vs 32-bit hashes — resolved.** `Mo_types.Hash.hash` always
   returns `logand 0x7fffffff sum`, so bit 31 is never set. Masks with
   bit 31 set are irrelevant (would never help distinguish hashes) and
   must be excluded: `Nat32.of_int32` raises `Invalid_argument` on
   negative int32 values (bit 31 set). Fix: change the Gosper loop
   guard from `!m <> 0l` to `!m > 0l` — stops at zero (wrapped) AND
   at any mask with bit 31 set. C(31,k) < 2^16 for k≤4 so this guard
   is now the binding constraint for small popcount levels.

6. **Threshold tuning.** `max(64, 4n)` is a starting point; may need
   benchmarking to confirm the right code-size / speed trade-off.

7. 
**GC forwarding pointers.** `get_variant_tag` already calls + `load_forwarding_pointer` — no change needed here. + +## Future Optimisation: Same-body arm merging + +### Scope decision (2026-04-23) + +Under V2, or-patterns (`case (#mon or #tue or … or #fri) false`) already +collapse to one arm block via the recognizer's `flatten_tag_leaves` +helper — all leaves of a single case contribute multiple entries to the +same sub-list in `Dispatch.Query`. That's the incentive channel: users +who recognize same-body structure and write an or-pattern get the +benefit. + +**We deliberately do NOT auto-merge arms with structurally-equal bodies +across distinct cases.** The reasoning: + +- Leaving cross-case merging to the user incentivises writing or-patterns, + which communicate intent (this group of tags genuinely shares an + outcome). Syntactic or-patterns are also stable under refactoring in a + way that "two cases happen to produce identical IR" is not. +- Auto-merging at the IR level is brittle: two arms with the same *IR* + expression may have different *typing contexts*, scope, or side-effect + behaviour in edge cases that an equality check might miss. +- More importantly, the right equivalence is **Wasm-instruction-sequence + equality**, not IR equality (see refinement below) — and that requires + ahead-of-time arm compilation, which is a larger restructuring. + +### Why same-body merging matters for strategy choice + +Same-body merging is **not** a speedup for the already-dispatched case +— each br_table slot still lands in its own arm block that executes the +same instructions regardless of whether the blocks are physically +shared. It's a code-size saving of that one duplicated body-block. + +But it **does** affect dispatcher choice upstream. The handler's +strategy space is parameterised by `N = number of distinct outcome +classes`, not by the raw arm count: + +- `ModPrime`: smaller `p` when classes merge — the br_table shrinks + linearly with class count. `mod 3` for 3 classes vs `mod 7` for 7 + (same per-op cost, but half the table bytes). +- `MaskShift`: fewer injectivity constraints → Gosper has more masks to + pick from → may land on smaller popcount (fewer SHR bits, more compact + table). +- Perfect hashing: easier to find for 3 tokens than 7. + +So merging propagates as input-size reduction through every strategy +downstream. Or-patterns already get this benefit because they arrive +pre-merged. Cross-case merging would extend it — but only where the +user didn't already write the or-pattern themselves. + +### Refinement: Wasm-level equivalence classes + +If we later add cross-case merging, the equivalence criterion should be +**raw Wasm instruction sequences**, not IR structural equality: + +1. Compile each arm's body ahead of choosing the dispatch strategy + (remember: each arm is already a `Block` internally, and composition + via `(^^^)` is just difference-list concatenation of `G.t`). +2. Compute an equivalence class per arm by comparing the compiled Wasm + byte sequences (modulo block-label numbering). +3. Pass the class-reduced `token_set` to `Dispatch.Query` — the handler + sees `k ≤ n` sub-lists. + +Advantages over IR-level merging: +- Blind to IR phase-ordering decisions (constant folding, inlining, + ANF) that might make two semantically-identical arms look different + at IR level. +- Catches arms that *compile to* identical bytes for reasons the IR + doesn't surface (e.g. two variant-projection patterns that happen to + emit the same offset load). 
+- No coupling to Motoko-specific expression-equality judgement — + works verbatim for any future matching context. + +### Incremental path + +1. V2 as-is: or-pattern merging only. Ships as part of #5927. +2. Later: ahead-of-time arm compilation in the recognizer, Wasm-bytes + comparison, cross-case merging. Handler protocol is already + compatible (token_set is a list-of-lists); this is a pure + recognizer-side extension. + +*Not yet implemented — tracked here for future work.* + +## Future Optimisation: Multi-strategy batched search + +The current single Gosper stream can still be slow for unlucky hash sets +(e.g. the 12-arm NNS `Action_` type needs ~8 000 iterations). The fix is +to run several independent strategy streams **concurrently** and take the +best early result — whichever strategy happens to work cheaply for the +given hashes terminates the search. + +### Candidate type + +```ocaml +type candidate = + | MaskShift of { mask : int32; shift : int; table_size : int } + (* runtime: (hash & mask) >> shift; overhead: AND + opt. SHR *) + | ModPrime of { prime : int; table_size : int } + (* runtime: hash % prime; overhead: i32.rem_u *) + | RotLow of { rot : int; bits : int; table_size : int } + (* runtime: rotl32(hash, rot) & (2^bits-1); overhead: i32.rotl + AND *) +``` + +### Cost model + +``` +cost(MaskShift{shift; table_size}) = 2 + (if shift > 0 then 1 else 0) + + table_size (* br_table entries *) +cost(ModPrime{prime}) = 3 + prime (* rem_u costs ~3 cycles *) +cost(RotLow{rot; bits}) = 2 + (if rot > 0 then 1 else 0) + + 1 lsl bits +``` + +Rank by estimated cycle cost; lower is better. + +### Strategy generators (lazy streams) + +1. **MaskShift (batched Gosper)**: For each bit-position window + `[2^n, 2^(n+1))` in increasing `n`, run Gosper within that window + with `popcount = bits_needed(arms)`, then `popcount+1`, etc. This + avoids charging to high-valued masks before exhausting low-bit windows. + +2. **ModPrime**: Iterate primes `p ≥ n` in increasing order; for each + check `hᵢ mod p` pairwise distinct. Terminates quickly when a small + collision-free prime exists. + +3. **RotLow**: For `bits = bits_needed(n)` and `rot = 0..30`, check + `rotl32(hᵢ, rot) & (2^bits - 1)` injective. 31 candidates per + `bits` level; try `bits+1` if none work. + +### Merger + +Run all generators round-robin (or priority-queue ordered by emitted cost). +Collect the first few injective candidates and emit the cheapest. A simple +scheme: advance each generator one step per round until the first result +appears in any stream; collect a bounded window of results (e.g. 4) across +all streams; return the cheapest. + +### Benefits + +- No single strategy's worst case dominates compile time. +- Prime-based dispatch is tried immediately and wins when a small prime + gives a tiny table (common for small variant types). +- Rotation-based dispatch is tried with only 31 probes and guarantees + table size ≤ 2^k — useful when Gosper + mod-prime both fail cheaply. +- The current 2^16 cutoff per strategy stream remains as a hard cap but + is rarely hit because one stream produces an early winner. + +### Interaction with same-body merging + +Apply the merged search after grouping arms by body (use equivalence-class +count as the effective `n`). 
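A minimal sketch of the merger, assuming each generator is exposed as a
lazy `Seq.t` that yields `Some c` when a step finds an injective candidate
and `None` when a step comes up empty; `cost` is the cycle model above,
and the bounded result window is simplified to "first productive round
wins":

```ocaml
(* Round-robin: take one step of every live stream per round; the first
   round that produces any candidate decides, and we keep its cheapest. *)
let merge_candidates (cost : candidate -> int)
    (streams : candidate option Seq.t list) : candidate option =
  let rec round = function
    | [] -> None                                 (* every stream exhausted *)
    | streams ->
      let hits, live =
        List.fold_left (fun (hits, live) s ->
            match s () with
            | Seq.Nil -> (hits, live)            (* stream gave up *)
            | Seq.Cons (None, rest) -> (hits, rest :: live)
            | Seq.Cons (Some c, rest) -> (c :: hits, rest :: live))
          ([], []) streams
      in
      (match hits with
       | [] -> round live
       | c :: cs ->
           Some (List.fold_left
                   (fun best c' -> if cost c' < cost best then c' else best)
                   c cs))
  in
  round streams
```
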
+ +*Not yet implemented — tracked here for future work.* + +## Future Optimisation: Pre-shortening before Gosper's iteration + +*(Subsumed by Multi-strategy search above — ModPrime and RotLow are the +concrete pre-shortening strategies described there.)* + +## Non-goals + +- Nested/wildcard patterns in variant arms (handled by `compile_pat_local`) +- `switch` on non-variant types +- JS backend (`moc.js`) diff --git a/src/codegen/compile_classical.ml b/src/codegen/compile_classical.ml index 563bef527e5..b084e038988 100644 --- a/src/codegen/compile_classical.ml +++ b/src/codegen/compile_classical.ml @@ -11170,6 +11170,74 @@ let compile_load_field env typ name = Object.load_idx env typ name +(* ===== Variant switch: masked br_table dispatch ===== + See .claude/plans/variant-switch-br-table.md for full design notes. *) + +(* ceil(log₂ n) — minimum bits needed to index n distinct values *) +let bits_needed n = + let rec f k = if 1 lsl k >= n then k else f (k + 1) in + if n <= 1 then 0 else f 1 + +(* Iterate all int32 bit-masks with exactly k bits set, in increasing + numerical order (Gosper's hack). Calls [f m] for each; stops early + if [f] returns true. *) +let iter_masks_with_popcount k f = + if k <= 0 || k > 32 then () + else + (* smallest k-bit mask: bits 0..k-1 set *) + let m = ref (Int32.sub (Int32.shift_left 1l k) 1l) in + let stop = ref false in + let iters = ref 0 in + while not !stop && !m > 0l && !iters < 0x10000 do (* 0l == wrapped; <0l == bit31 set (hashes are 31-bit) *) + incr iters; + if f !m then stop := true + else begin + (* Gosper's hack: next int32 with same popcount *) + let n = !m in + let c = Int32.logand n (Int32.neg n) in (* lowest set bit *) + let r = Int32.add n c in (* clear run, advance *) + m := Int32.logor r + (Int32.shift_right_logical + (Int32.div (Int32.logxor r n) c) 2) + end + done + +(* True iff all (h & mask) values in [hashes] are distinct *) +let is_injective mask hashes = + let masked = List.map (Int32.logand mask) hashes in + List.length masked = List.length (List.sort_uniq Int32.compare masked) + +(* Table size after applying mask+shift: max((hᵢ & mask) >> shift) + 1 *) +let compact_table_size mask shift hashes = + List.fold_left (fun acc h -> + max acc (Int32.to_int (Int32.shift_right_logical (Int32.logand h mask) shift)) + ) 0 hashes + 1 + +(* Find mask M and shift S for an n-arm variant switch. + Returns Some (mask, shift, table_size) or None if no suitable mask found. + Iterates masks of minimal popcount first (smallest integer = low bits = + compact table), accepting the first that is injective and within threshold. *) +let find_variant_mask n hashes = + let threshold = max 64 (4 * n) in + let rec try_popcount req = + if req > 8 then None + else + let result = ref None in + iter_masks_with_popcount req (fun mask -> + if is_injective mask hashes then begin + let shift = Int32.to_int Numerics.Nat32.(of_int32 mask |> ctz |> to_int32) in + let tbl = compact_table_size mask shift hashes in + if tbl <= threshold then begin + result := Some (mask, shift, tbl); true (* stop *) + end else false + end else false + ); + match !result with + | Some _ as r -> r + | None -> try_popcount (req + 1) + in + try_popcount (bits_needed n) + (* compile_lexp is used for expressions on the left of an assignment operator. 
Produces * preparation code, to run first @@ -12776,6 +12844,20 @@ and single_case e (cs : Ir.case list) = and known_tag_pat p = TagP ("", p) +(* Flatten a pattern into the list of tag-label leaves it matches, or None if + any leaf is not a non-empty TagP. AltP trees are collapsed left-to-right. + This turns `(#mon | #tue | ... | #fri)` into `[("mon", _); ("tue", _); ...]` + so the SwitchE handler can count it as 5 dispatch targets for mask-finding, + while still compiling the arm body just once. *) +and flatten_tag_leaves (p : Ir.pat) : (string * Ir.pat) list option = + match p.it with + | TagP (l, sub) when l <> "" -> Some [(l, sub)] + | AltP (p1, p2) -> + (match flatten_tag_leaves p1, flatten_tag_leaves p2 with + | Some l1, Some l2 -> Some (l1 @ l2) + | _ -> None) + | _ -> None + and simplify_cases e (cs : Ir.case list) = match cs, e.note.Note.typ with (* for a 2-cased variant type, the second comparison can be omitted when the first pattern @@ -12873,6 +12955,92 @@ and compile_exp_with_hint (env : E.t) ae sr_hint exp = G.i Unreachable (* We should always exit using the branch_code *) ) + (* Variant switch with 4+ tag leaves (post or-pattern flattening): + use masked br_table dispatch (O(1)). + Guard pre-checks find_variant_mask so that None falls through naturally + to the regular SwitchE handler below (no broken known_tag_pat fallback). + Or-patterns like `(#mon | #tue | ... | #fri) true` count as multiple + dispatch targets (one per leaf) but compile to a single arm body — the + dispatch table has multiple slots pointing to the same arm block. *) + | SwitchE (e, cs) when + (let per_case = List.map (fun {it=({pat; _} : case'); _} -> flatten_tag_leaves pat) cs in + List.for_all Option.is_some per_case && + let all_leaves = List.concat_map Option.get per_case in + let n = List.length all_leaves in + n >= 4 && + let hashes = List.map (fun (l, _) -> Variant.hash_variant_label env l) all_leaves in + find_variant_mask n hashes <> None) -> + let code1 = compile_exp_vanilla env ae e in + let (set_i, get_i) = new_local env "switch_in" in + + (* Per case: compile the body once (using the first leaf's sub-pattern + — Motoko or-pattern typing ensures all legs bind the same variables). + Record the set of leaf hashes that dispatch to this case. *) + let cases = List.map (fun {it={pat; exp=arm_exp}; _} -> + let [@warning "-8"] Some leaves = flatten_tag_leaves pat in + let [@warning "-8"] (_, first_sub_pat) :: _ = leaves in + let ae1, pat_code = compile_pat_local env ae {pat with it = known_tag_pat first_sub_pat} in + let sr, rhs_code = compile_exp_with_hint env ae1 sr_hint arm_exp in + let leaf_hashes = List.map (fun (l, _) -> Variant.hash_variant_label env l) leaves in + (leaf_hashes, sr, CannotFail get_i ^^^ pat_code ^^^ CannotFail rhs_code) + ) cs in + + let n_cases = List.length cases in + let all_hashes = List.concat_map (fun (hs, _, _) -> hs) cases in + let n_leaves = List.length all_hashes in + + let final_sr = match sr_hint with + | Some sr -> sr + | None -> StackRep.joins (List.map (fun (_, sr, _) -> sr) cases) + in + + let [@warning "-8"] Some (mask, shift, table_size) = find_variant_mask n_leaves all_hashes in + (* Build dispatch table: slot j -> case index (0..n_cases-1) or n_cases (default). + Every leaf of case k contributes one slot pointing to k. 
*) + let slot_for_case = Array.make table_size n_cases in + List.iteri (fun case_idx (leaf_hashes, _, _) -> + List.iter (fun hash -> + let slot = Int32.to_int + (Int32.shift_right_logical (Int32.logand hash mask) shift) in + slot_for_case.(slot) <- case_idx + ) leaf_hashes + ) cases; + + final_sr, + code1 ^^ set_i ^^ + FakeMultiVal.block_ env (StackRep.to_block_type env final_sr) (fun branch_code -> + + (* Dispatch code: load tag, mask, optional shift, br_table *) + let dispatch = + get_i ^^ + Variant.get_variant_tag env ^^ + compile_bitand_const mask ^^ + (if shift > 0 then compile_shrU_const (Int32.of_int shift) else G.nop) ^^ + G.i (BrTable ( + List.init table_size (fun j -> nr (Int32.of_int slot_for_case.(j))), + nr (Int32.of_int n_cases) (* default: unreachable *) + )) + in + + (* Arm body codes: sub-pattern match + rhs + SR-adjust + exit. + On sub-pattern failure (impossible for well-typed code): trap. *) + let arm_body_codes = List.map (fun (_, sr, c) -> + with_fail (G.i Unreachable) + (c ^^^ CannotFail (StackRep.adjust env sr final_sr ^^ branch_code)) + ) cases in + + (* Build nested blocks from inside out: + block_default { block_case_{k-1} { ... block_case_0 { dispatch } + body_0 ... } body_{k-1} } unreachable + Inside dispatch: label k -> case k, label n_cases -> default. + fold starts with dispatch (not an extra wrapper), so case_0 is label 0. *) + let with_arms = List.fold_left (fun acc body_code -> + G.block0 acc ^^ body_code + ) dispatch arm_body_codes in + G.block0 with_arms ^^ + G.i Unreachable + ) + | SwitchE (e, cs) -> let code1 = compile_exp_vanilla env ae e in let (set_i, get_i) = new_local env "switch_in" in diff --git a/src/codegen/compile_enhanced.ml b/src/codegen/compile_enhanced.ml index 269e355b340..30739dcc39b 100644 --- a/src/codegen/compile_enhanced.ml +++ b/src/codegen/compile_enhanced.ml @@ -11558,6 +11558,232 @@ let compile_load_field env typ name = Object.load_idx env typ name +(* ===== Variant switch: masked br_table dispatch ===== + See .claude/plans/variant-switch-br-table.md for full design notes. + NB: EOP backend uses int64 hashes throughout (unlike classical int32). *) + +(* ceil(log₂ n) — minimum bits needed to index n distinct values *) +let bits_needed n = + let rec f k = if 1 lsl k >= n then k else f (k + 1) in + if n <= 1 then 0 else f 1 + +(* Iterate all int64 bit-masks with exactly k bits set, in increasing + numerical order (Gosper's hack). Calls [f m] for each; stops early + if [f] returns true. 
*) +let iter_masks_with_popcount k f = + if k <= 0 || k > 32 then () (* EOP hashes are extend_i32_u: bits 0-31 only *) + else + (* smallest k-bit mask: bits 0..k-1 set *) + let m = ref (Int64.sub (Int64.shift_left 1L k) 1L) in + let stop = ref false in + let iters = ref 0 in + while not !stop && !m <> 0L && Int64.compare !m 0x1_0000_0000L < 0 && !iters < 0x10000 do + incr iters; + if f !m then stop := true + else begin + (* Gosper's hack: next int64 with same popcount *) + let n = !m in + let c = Int64.logand n (Int64.neg n) in (* lowest set bit *) + let r = Int64.add n c in (* clear run, advance *) + m := Int64.logor r + (Int64.shift_right_logical + (Int64.div (Int64.logxor r n) c) 2) + end + done + +(* True iff all (h & mask) values in [hashes] are distinct *) +let is_injective mask hashes = + let masked = List.map (Int64.logand mask) hashes in + List.length masked = List.length (List.sort_uniq Int64.compare masked) + +(* Table size after applying mask+shift: max((hᵢ & mask) >> shift) + 1 *) +let compact_table_size mask shift hashes = + List.fold_left (fun acc h -> + max acc (Int64.to_int (Int64.shift_right_logical (Int64.logand h mask) shift)) + ) 0 hashes + 1 + +(* Count trailing zeros of a non-zero int64 (works for any bit pattern). *) +let ctz64 m = + let c = ref (Int64.logand m (Int64.neg m)) in (* isolate lowest set bit *) + let s = ref 0 in + while !c <> 1L do c := Int64.shift_right_logical !c 1; incr s done; + !s + +(* Find mask M and shift S for an n-arm variant switch. + Returns Some (mask, shift, table_size) or None if no suitable mask found. + Iterates masks of minimal popcount first (smallest integer = low bits = + compact table), accepting the first that is injective and within threshold. *) +let find_variant_mask n hashes = + let threshold = max 64 (4 * n) in + let rec try_popcount req = + if req > 8 then None + else + let result = ref None in + iter_masks_with_popcount req (fun mask -> + if is_injective mask hashes then begin + let shift = ctz64 mask in + let tbl = compact_table_size mask shift hashes in + if tbl <= threshold then begin + result := Some (mask, shift, tbl); true (* stop *) + end else false + end else false + ); + match !result with + | Some _ as r -> r + | None -> try_popcount (req + 1) + in + try_popcount (bits_needed n) + +(* Dispatch strategy protocol (effect-based, streaming). + + Recognizer — the code that walks a multi-way match (today: the + SwitchE variant case; tomorrow: and-patterns, literal-match chains) + — streams one case at a time via `Match_arm`, then signals + completion with `Match_join` to receive the strategy plan. Each + `Match_arm hashes` hands the handler one case's token set (or-pattern + legs contribute multiple hashes in a single `Match_arm` call). + + This streaming shape is cheap ceremony for whole-switch recognizers + like SwitchE (which already knows all cases upfront), but leaves + room for future recognizers that surface decisions incrementally — + AND-pattern sub-components, nested literal chains, etc. — without + protocol churn. + + Handler — maintains mutable state across `Match_arm` calls and + returns a `plan` at `Match_join`. The default handler runs + Gosper-based mask-finding. Outer scopes may override by installing + their own handler before the recognizer fires (e.g. a test wanting + to pin a specific plan, or a `--preserve-switch-shape` debug flag + that forces `Linear`). 
*) +module Dispatch = struct + type plan = + | MaskShift of { + mask : int64; + shift : int; + table_size : int; + (* For each slot j in the br_table, which case index receives it. + Slot value `n_cases` means "default" (unreachable for + exhaustive variant matches). *) + slot_for_case : int array; + } + | ModPrime of { + p : int; + (* For each residue r in 0..p-1, which case index receives it. *) + case_for_residue : int array; + } + | Linear (* fall back to the default linear-chain SwitchE emission *) + + type _ Effect.t += + | Match_arm : int64 list -> unit Effect.t + | Match_join : plan Effect.t + + let gosper_plan (case_hashes : int64 list list) : plan = + let flat = List.concat case_hashes in + let n = List.length flat in + let n_cases = List.length case_hashes in + if n < 4 then Linear + else match find_variant_mask n flat with + | None -> Linear + | Some (mask, shift, table_size) -> + let slot_for_case = Array.make table_size n_cases in + List.iteri (fun case_idx hashes -> + List.iter (fun hash -> + let slot = Int64.to_int + (Int64.shift_right_logical (Int64.logand hash mask) shift) in + slot_for_case.(slot) <- case_idx + ) hashes + ) case_hashes; + MaskShift { mask; shift; table_size; slot_for_case } + + (* Try `hash mod p` for small primes p ≥ n_cases. Succeeds when all + leaf hashes of the same case share a residue AND different cases + land on different residues — i.e. mod p naturally clusters or- + pattern branches together into the right case. The br_table is + then tiny: exactly p slots. Primes are tried smallest-first so + the emitted table is as compact as possible. + + Cost trade-off vs MaskShift: `rem_u` is slower than `and`+`shr_u` + on most Wasm engines (engine-dependent; roughly 5–20× per op), + but ModPrime's table can be drastically smaller when or-patterns + reduce the effective continuation count. For tiny c (say c ≤ 8) + the size win is real; for larger c MaskShift wins. *) + let modprime_plan (case_hashes : int64 list list) : plan option = + let n_cases = List.length case_hashes in + let candidates = [2; 3; 5; 7; 11; 13; 17; 19; 23; 29; 31] in + let try_prime p = + if p < n_cases then None + else + let case_for_residue = Array.make p n_cases in + let ok = ref true in + List.iteri (fun case_idx hashes -> + if !ok then + List.iter (fun hash -> + let r = Int64.to_int (Int64.rem hash (Int64.of_int p)) in + if case_for_residue.(r) = n_cases then + case_for_residue.(r) <- case_idx + else if case_for_residue.(r) <> case_idx then + ok := false + ) hashes + ) case_hashes; + if !ok then Some (ModPrime { p; case_for_residue }) + else None + in + List.find_map try_prime candidates + + (* Pick a plan. + + This policy is deliberately ad-hoc: its only job is to demonstrate + that the `Dispatch` protocol can pick *different* strategies for + or-patterns vs flat-arm expansions of the same switch, producing + measurably different cycle counts. It is NOT tuned for real + workloads yet — ModPrime's `rem_u` is more expensive than + MaskShift's `and`+`shr_u` under the ICP cycle model, so routing + or-patterns to ModPrime makes them *slower* than the flat form. + + Why keep it anyway: the split is visible in the benchmark + (`startLetterOr` > `startLetter` after this patch), which proves + the mechanism branches on `c < n`. A follow-up commit can replace + this with a smarter policy (e.g. case-aware Gosper that finds + smaller masks when or-patterns allow same-case hashes to share a + slot) without touching the protocol or the emitter. 
+ + Concretely: + - `n` = total leaves across all cases + - `c` = number of cases (distinct continuations) + - `c < n` ↔ at least one case has an or-pattern + + When `c < n` we route through ModPrime (fallback to MaskShift if + no prime works); when `c = n` we take the MaskShift fast path. *) + let choose_plan (case_hashes : int64 list list) : plan = + let c = List.length case_hashes in + let n = List.length (List.concat case_hashes) in + if n < 4 then Linear + else if c < n then + match modprime_plan case_hashes with + | Some p -> p + | None -> gosper_plan case_hashes + else + gosper_plan case_hashes + + (* Install the default handler around `body`. Arms submitted via + `Match_arm` accumulate in reverse; `Match_join` reverses once and + commits a plan. Each `with_handler` scope has its own accumulator + — nested switches don't leak state. *) + let with_handler (type r) (body : unit -> r) : r = + let arms_rev = ref [] in + let effc : type a. a Effect.t -> ((a, r) Effect.Deep.continuation -> r) option = function + | Match_arm hashes -> + Some (fun k -> + arms_rev := hashes :: !arms_rev; + Effect.Deep.continue k ()) + | Match_join -> + Some (fun k -> + Effect.Deep.continue k (choose_plan (List.rev !arms_rev))) + | _ -> None + in + Effect.Deep.try_with body () { Effect.Deep.effc = effc } +end + (* compile_lexp is used for expressions on the left of an assignment operator. Produces * preparation code, to run first @@ -13100,6 +13326,20 @@ and single_case e (cs : Ir.case list) = and known_tag_pat p = TagP ("", p) +(* Flatten a pattern into the list of tag-label leaves it matches, or None if + any leaf is not a non-empty TagP. AltP trees are collapsed left-to-right. + This turns `(#mon | #tue | ... | #fri)` into `[("mon", _); ("tue", _); ...]` + so the SwitchE handler can count it as 5 dispatch targets for mask-finding, + while still compiling the arm body just once. *) +and flatten_tag_leaves (p : Ir.pat) : (string * Ir.pat) list option = + match p.it with + | TagP (l, sub) when l <> "" -> Some [(l, sub)] + | AltP (p1, p2) -> + (match flatten_tag_leaves p1, flatten_tag_leaves p2 with + | Some l1, Some l2 -> Some (l1 @ l2) + | _ -> None) + | _ -> None + and simplify_cases e (cs : Ir.case list) = match cs, e.note.Note.typ with (* for a 2-cased variant type, the second comparison can be omitted when the first pattern @@ -13201,29 +13441,116 @@ and compile_exp_with_hint (env : E.t) ae sr_hint exp = let code1 = compile_exp_vanilla env ae e in let (set_i, get_i) = new_local env "switch_in" in - (* compile subexpressions and collect the provided stack reps *) - let codes = List.map (fun {it={pat; exp=e}; _} -> - let (ae1, pat_code) = compile_pat_local env ae pat in - let (sr, rhs_code) = compile_exp_with_hint env ae1 sr_hint e in - (sr, CannotFail get_i ^^^ pat_code ^^^ CannotFail rhs_code) + (* Recognizer: if every case is tag-leafy (`TagP` possibly wrapped in + nested `AltP` legs), stream each case's leaf hashes to the + `Dispatch` handler via `Match_arm`, then request the plan via + `Match_join`. Or-patterns contribute multiple hashes in one + `Match_arm` but compile to a single arm body. A `MaskShift` plan + fires br_table dispatch; a `Linear` plan or a non-tag-leafy case + falls through to the linear-chain emission below. 
*) + let maybe_plan : Dispatch.plan option = + if List.for_all (fun {it=({pat; _} : case'); _} -> + Option.is_some (flatten_tag_leaves pat)) cs then + Some (Dispatch.with_handler (fun () -> + List.iter (fun {it=({pat; _} : case'); _} -> + let [@warning "-8"] Some leaves = flatten_tag_leaves pat in + let hashes = List.map (fun (l, _) -> Variant.hash_variant_label env l) leaves in + Effect.perform (Dispatch.Match_arm hashes) + ) cs; + Effect.perform Dispatch.Match_join)) + else None + in + + (match maybe_plan with + | Some ((Dispatch.MaskShift _ | Dispatch.ModPrime _) as plan) -> + (* Per case: compile the body once (using the first leaf's sub-pattern + — Motoko or-pattern typing ensures all legs bind the same variables). *) + let cases = List.map (fun {it={pat; exp=arm_exp}; _} -> + let [@warning "-8"] Some leaves = flatten_tag_leaves pat in + let [@warning "-8"] (_, first_sub_pat) :: _ = leaves in + let ae1, pat_code = compile_pat_local env ae {pat with it = known_tag_pat first_sub_pat} in + let sr, rhs_code = compile_exp_with_hint env ae1 sr_hint arm_exp in + (sr, CannotFail get_i ^^^ pat_code ^^^ CannotFail rhs_code) + ) cs in + + let n_cases = List.length cases in + + let final_sr = match sr_hint with + | Some sr -> sr + | None -> StackRep.joins (List.map fst cases) + in + + (* Build the dispatch prologue + br_table per plan. Both strategies + arrive at the same (case_idx, default) br_table interface — only + the index-computation prologue differs. *) + let dispatch_code = + let prologue_plus_table = + match plan with + | Dispatch.MaskShift { mask; shift; table_size; slot_for_case } -> + compile_bitand_const mask ^^ + (if shift > 0 then compile_shrU_const (Int64.of_int shift) else G.nop) ^^ + G.i (Convert (Wasm_exts.Values.I32 I32Op.WrapI64)) ^^ + G.i (BrTable ( + List.init table_size (fun j -> nr (Int32.of_int slot_for_case.(j))), + nr (Int32.of_int n_cases) (* default: unreachable *) + )) + | Dispatch.ModPrime { p; case_for_residue } -> + compile_unboxed_const (Int64.of_int p) ^^ + G.i (Binary (Wasm_exts.Values.I64 I64Op.RemU)) ^^ + G.i (Convert (Wasm_exts.Values.I32 I32Op.WrapI64)) ^^ + G.i (BrTable ( + List.init p (fun r -> nr (Int32.of_int case_for_residue.(r))), + nr (Int32.of_int n_cases) + )) + | Dispatch.Linear -> assert false + in + get_i ^^ Variant.get_variant_tag env ^^ prologue_plus_table + in + + final_sr, + code1 ^^ set_i ^^ + FakeMultiVal.block_ env (StackRep.to_block_type env final_sr) (fun branch_code -> + + (* Arm body codes: sub-pattern match + rhs + SR-adjust + exit. + On sub-pattern failure (impossible for well-typed code): trap. *) + let arm_body_codes = List.map (fun (sr, c) -> + with_fail (G.i Unreachable) + (c ^^^ CannotFail (StackRep.adjust env sr final_sr ^^ branch_code)) + ) cases in + + (* Build nested blocks from inside out: + block_default { block_case_{k-1} { ... block_case_0 { dispatch } + body_0 ... } body_{k-1} } unreachable + Inside dispatch: label k -> case k, label n_cases -> default. + fold starts with dispatch (not an extra wrapper), so case_0 is label 0. *) + let with_arms = List.fold_left (fun acc body_code -> + G.block0 acc ^^ body_code + ) dispatch_code arm_body_codes in + G.block0 with_arms ^^ + G.i Unreachable + ) + + | Some Dispatch.Linear | None -> + (* Linear hash-compare chain (default). 
*) + let codes = List.map (fun {it={pat; exp=e}; _} -> + let (ae1, pat_code) = compile_pat_local env ae pat in + let (sr, rhs_code) = compile_exp_with_hint env ae1 sr_hint e in + (sr, CannotFail get_i ^^^ pat_code ^^^ CannotFail rhs_code) ) (simplify_cases e cs) in - (* Use the expected stackrep, if given, else infer from the branches *) - let final_sr = match sr_hint with - | Some sr -> sr - | None -> StackRep.joins (List.map fst codes) - in + let final_sr = match sr_hint with + | Some sr -> sr + | None -> StackRep.joins (List.map fst codes) + in - final_sr, - (* Run scrut *) - code1 ^^ set_i ^^ - (* Run rest in block to exit from *) - FakeMultiVal.block_ env (StackRep.to_block_type env final_sr) (fun branch_code -> - orsPatternFailure env (List.map (fun (sr, c) -> - c ^^^ CannotFail (StackRep.adjust env sr final_sr ^^ branch_code) - ) codes) ^^ - G.i Unreachable (* We should always exit using the branch_code *) - ) + final_sr, + code1 ^^ set_i ^^ + FakeMultiVal.block_ env (StackRep.to_block_type env final_sr) (fun branch_code -> + orsPatternFailure env (List.map (fun (sr, c) -> + c ^^^ CannotFail (StackRep.adjust env sr final_sr ^^ branch_code) + ) codes) ^^ + G.i Unreachable (* We should always exit using the branch_code *) + )) (* Async-wait lowering support features *) | DeclareE (name, typ, e) -> let ae1, i = VarEnv.add_local_with_heap_ind env ae name typ in diff --git a/test/bench/ok/variant-switch.drun-run.ok b/test/bench/ok/variant-switch.drun-run.ok index dfa3aef2a7c..9ded029edea 100644 --- a/test/bench/ok/variant-switch.drun-run.ok +++ b/test/bench/ok/variant-switch.drun-run.ok @@ -1,9 +1,9 @@ ingress Completed: Reply: 0x4449444c016c01b3c4b1f204680100010a00000000000000000101 ingress Completed: Reply: 0x4449444c0000 -debug.print: {heap_diff = 0; instr_diff = 137_590_321; total = 1_090_000} +debug.print: {heap_diff = 0; instr_diff = 95_690_321; total = 1_090_000} ingress Completed: Reply: 0x4449444c0000 -debug.print: {fib7_eval = 13; fib7_evalFT = 13; fib7_xform = 13; instr_eval = 24_509_348; instr_evalFT = 21_536_248; instr_transform = 1_189_148} +debug.print: {fib7_eval = 13; fib7_evalFT = 13; fib7_xform = 13; instr_eval = 22_051_348; instr_evalFT = 21_536_248; instr_transform = 1_057_948} ingress Completed: Reply: 0x4449444c0000 -debug.print: {acc1 = 20_000; acc2 = 20_000; instr_isWeekend = 10_010_321; instr_isWeekendOr = 11_050_321} +debug.print: {acc1 = 20_000; acc2 = 20_000; instr_isWeekend = 7_070_321; instr_isWeekendOr = 7_560_321; instr_startLetter = 6_270_321; instr_startLetterOr = 6_760_321; last3 = 'S'; last4 = 'S'} ingress Completed: Reply: 0x4449444c0000 ingress Completed: Reply: 0x4449444c0000 diff --git a/test/bench/variant-switch.mo b/test/bench/variant-switch.mo index 9e5a935cae1..2e2a8b07ac2 100644 --- a/test/bench/variant-switch.mo +++ b/test/bench/variant-switch.mo @@ -120,6 +120,30 @@ persistent actor Core { case (#Sat or #Sun) true; }; + // 7-way distinct-outcome dispatch (richer than the 2-way isWeekend Bool). + // Bodies include two natural same-body groups: {Tue, Thu} → 'T' and + // {Sat, Sun} → 'S'. Useful to compare flat-arm vs or-pattern compilation + // when the payload is varied rather than collapsed to a Bool. 
+ func startLetter(d : Weekday) : Char = + switch d { + case (#Mon) 'M'; + case (#Tue) 'T'; + case (#Wed) 'W'; + case (#Thu) 'T'; + case (#Fri) 'F'; + case (#Sat) 'S'; + case (#Sun) 'S'; + }; + + func startLetterOr(d : Weekday) : Char = + switch d { + case (#Mon) 'M'; + case (#Tue or #Thu) 'T'; + case (#Wed) 'W'; + case (#Fri) 'F'; + case (#Sat or #Sun) 'S'; + }; + transient let week : [Weekday] = [#Mon, #Tue, #Wed, #Thu, #Fri, #Sat, #Sun]; @@ -326,10 +350,26 @@ persistent actor Core { j += 1; }; let (_m2, n2) = counters(); + var last3 : Char = 'X'; + var k = 0; + while (k < 10_000) { + for (d in week.vals()) { last3 := startLetter d }; + k += 1; + }; + let (_m3, n3) = counters(); + var last4 : Char = 'X'; + var l = 0; + while (l < 10_000) { + for (d in week.vals()) { last4 := startLetterOr d }; + l += 1; + }; + let (_m4, n4) = counters(); debugPrint(debug_show { - acc1; acc2; - instr_isWeekend = n1 - n0; - instr_isWeekendOr = n2 - n1; + acc1; acc2; last3; last4; + instr_isWeekend = n1 - n0; + instr_isWeekendOr = n2 - n1; + instr_startLetter = n3 - n2; + instr_startLetterOr = n4 - n3; }); }; }; diff --git a/test/run/variant_switch.mo b/test/run/variant_switch.mo new file mode 100644 index 00000000000..e6f9e360323 --- /dev/null +++ b/test/run/variant_switch.mo @@ -0,0 +1,60 @@ +// Test masked br_table dispatch for variant switches (4+ arms) + +type Color = { #red; #green; #blue; #yellow }; + +func colorName(c : Color) : Text = + switch c { + case (#red) "red"; + case (#green) "green"; + case (#blue) "blue"; + case (#yellow) "yellow"; + }; + +assert (colorName (#red) == "red"); +assert (colorName (#green) == "green"); +assert (colorName (#blue) == "blue"); +assert (colorName (#yellow) == "yellow"); + +type Weekday = { #Mon; #Tue; #Wed; #Thu; #Fri; #Sat; #Sun }; + +func isWeekend(d : Weekday) : Bool = + switch d { + case (#Mon) false; + case (#Tue) false; + case (#Wed) false; + case (#Thu) false; + case (#Fri) false; + case (#Sat) true; + case (#Sun) true; + }; + +assert (not (isWeekend (#Mon))); +assert (not (isWeekend (#Fri))); +assert (isWeekend (#Sat)); +assert (isWeekend (#Sun)); + +// Same dispatch expressed with or-patterns — exercises same-body arm merging +func isWeekendOr(d : Weekday) : Bool = + switch d { + case (#Mon or #Tue or #Wed or #Thu or #Fri) false; + case (#Sat or #Sun) true; + }; + +assert (not (isWeekendOr (#Mon))); +assert (not (isWeekendOr (#Fri))); +assert (isWeekendOr (#Sat)); +assert (isWeekendOr (#Sun)); + +// Variant with payload — sub-pattern binding still works +type Shape = { #circle : Float; #rect : (Float, Float); #tri : Float; #dot : () }; + +func area(s : Shape) : Float = + switch s { + case (#circle r) r * r * 3.14159; + case (#rect(w, h)) w * h; + case (#tri b) b * b * 0.5; + case (#dot _) 0.0; + }; + +assert (area (#dot ()) == 0.0); +assert (area (#rect(3.0, 4.0)) == 12.0);
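
// Suggested additional coverage (values chosen so the Float arithmetic
// stays exact and == is safe): the circle and tri arms are otherwise
// never executed.
assert (area (#circle 1.0) == 3.14159);
assert (area (#tri 4.0) == 8.0);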