diff --git a/.claude/plans/variant-switch-br-table.md b/.claude/plans/variant-switch-br-table.md new file mode 100644 index 00000000000..b229378fcfb --- /dev/null +++ b/.claude/plans/variant-switch-br-table.md @@ -0,0 +1,566 @@ +# Plan: Masked `br_table` Dispatch for Variant Switches in `moc` + +## Motivation + +Motoko variant tags are 32-bit hashes of string labels. A `switch` on a +variant with `n` arms currently compiles to a linear chain of `n` hash +comparisons (`orsPatternFailure`). The existing special cases are: + +- **1-arm**: skip tag check entirely (`single_case`) +- **2-arm**: skip the second comparison (`simplify_cases`) +- **n-arm**: linear chain — O(n) comparisons + +This plan adds an O(1) dispatch for the n-arm case using Wasm `br_table` +and a compile-time-chosen bitmask. + +## Core Idea + +Given variant hashes `h₁, h₂, ..., hₙ`, find a mask `M` at compile time +such that `hᵢ & M` are all distinct. Also compute `S = ctz(M)` (count of +trailing zero bits in `M`) at compile time. Then emit at runtime: + +```wasm +local.get $tag +i32.const M ;; compile-time constant +i32.and ;; runtime: mask the tag +i32.const S ;; compile-time constant: S = ctz(M) — both lines +i32.shr_u ;; omitted entirely when S = 0 (would be a nop) +br_table $tbl[0] $tbl[1] ... $default +``` + +The right-shift eliminates the trailing zero bits of `M`, reducing the +effective table size from `max(hᵢ & M) + 1` to +`(max(hᵢ & M) >> S) + 1` — potentially much smaller. + +## Finding M and S (OCaml, all at compile time) + +**Algorithm:** iterate integers `n` from 1 upward, filtering for +`popcount(n) = ceil(log₂(arms))` (the minimum bits needed to distinguish +`arms` values — exactly `k` bits can represent 2^k distinct indices). +For each candidate mask, check if masking all hashes is injective. +Return the first that passes — it is automatically compact because small +integers concentrate their set bits in low positions, keeping `max(hᵢ & M)` +small and thus the table size small after the `>> S` shift. + +Note: `floor(log₂(n)) + 1` equals `ceil(log₂(n))` only when `n` is *not* +a power of 2; for powers of 2 it overshoots by 1. Use `ceil` directly. + +```ocaml +(* All computation below happens during Wasm code generation, not at runtime *) + +let popcount32 m = (* count set bits in int32 *) ... +let ctz32 m = (* count trailing zeros; 32 if m = 0l *) ... + +(* ceil(log₂(n)) — minimum bits to index n distinct values *) +let bits_needed n = + let rec f k = if 1 lsl k >= n then k else f (k + 1) in + f 1 + +let is_injective mask hashes = + let masked = List.map (Int32.logand mask) hashes in + List.length masked = List.length (List.sort_uniq compare masked) + +let find_mask arms hashes threshold = + let required_bits = bits_needed arms in + (* Iterate n = 1, 2, 3, ... filtering for popcount = required_bits *) + let rec loop n = + if n = 0l then None (* wrapped around — give up, use linear *) + else if popcount32 n = required_bits && is_injective n hashes then + let s = ctz32 n in + let tbl_size = (* max((hᵢ & n) >> s) + 1, computed at compile time *) ... in + if tbl_size <= threshold then Some (n, s, tbl_size) + else loop (Int32.add n 1l) (* mask valid but table too big: keep trying *) + else loop (Int32.add n 1l) + in + loop 1l +``` + +If no mask with `required_bits` bits fits in the threshold, try +`required_bits + 1`, etc., up to a small maximum (e.g. 8 bits). + +**Threshold:** e.g. `max(64, 4 * n)`. + +## Wasm Block Structure (runtime) + +`br_table` with index `i` exits the `i`-th enclosing block. 

```wasm
block $exit
  block $default
    block $arm_{n-1}
      ...
      block $arm_0
        local.get $tag
        i32.const M    ;; compile-time constant
        i32.and        ;; runtime
        ;; only emit next two instructions when S > 0:
        i32.const S    ;; compile-time constant, S = ctz(M)
        i32.shr_u      ;; runtime — omitted entirely when S = 0
        br_table $tbl[0] .. $tbl[tbl_size-1] $default
      end ;; $arm_0
      ;; arm 0 body
      br $exit
      ...
    end ;; $arm_{n-1}
    ;; arm n-1 body
    br $exit
  end ;; $default
  unreachable ;; type-safe: dead code
end ;; $exit
```

`$tbl[j]` = `$arm_k` where `(hₖ & M) >> S = j`, else `$default`.

## Break-even Analysis

### Dynamic instruction count

Each arm in the current linear chain costs ~6 instructions:
`local.get` + 2 heap loads (forwarding ptr + tag field) + `i32.const` + `i32.eq` + `br_if`.
The chain stops at the first match, so:

| Path | Instructions executed |
|------|-----------------------|
| Linear (worst case, last arm) | `6n` |
| Linear (average, uniform input) | `3n` |
| br_table, S = 0 | **6** (get + 2 loads + const M + and + br_table) |
| br_table, S > 0 | **8** (+ const S + shr_u) |

**Worst-case break-even:** n ≥ 2 for both variants (6 < 12, 8 < 12).
**Average break-even:** n ≥ 3 (6 < 9, 8 < 9).

Since n = 1 and n = 2 are already handled by `single_case` / `simplify_cases`,
the `br_table` path is a strict win for every case it applies to (n ≥ 3).

### Static code size

Approximating ~4 bytes/instruction and ~4 bytes/br_table entry:

| Path | Bytes |
|------|-------|
| Linear | `~26n` (6 instrs/arm + block overhead) |
| br_table, S = 0 | `~20 + 4 × table_size + 2n` |
| br_table, S > 0 | `~28 + 4 × table_size + 2n` |

Code-size break-even (S = 0): `table_size ≤ 6(n − 1)` → e.g. n=3 allows table_size ≤ 12.
Code-size break-even (S > 0): `table_size ≤ 6n − 8` → e.g. n=3 allows table_size ≤ 10.

The `4n` term of the threshold `max(64, 4n)` stays within these limits for
n ≥ 4, but the flat 64 floor exceeds them until n = 12, so a pathological
hash set may trade some code size for the O(1) dispatch. In practice the
minimal-popcount-first search settles on tables far below the cap.

### Empirical measurement (TODO)

Add a `bench` test that calls a large variant switch (e.g. 8-arm, 16-arm)
in a tight loop, comparing the old linear output (forced via flag or saved
`.wat`) against the optimised one. The existing `bench` package in the
test suite can measure Wasm instruction counts or wall-clock cycles.

Note: Wasm JIT compilers (wasmtime, V8) typically lower `br_table` to a
hardware jump table, giving an additional constant-factor speedup over
what static instruction counts suggest.

## Where to Apply the Optimisation

Three candidate insertion points, evolved over the life of the PR:

**Option A — IR level** (`SwitchE` with `TagP` arms, in `compile_classical.ml`)
— the shipped V1. Easy to reach labels, hashes, and exhaustiveness,
but only sees one IR shape: a flat list of `TagP` arms with distinct bodies.

**Option B — Wasm peephole** (scan `i32.const hash / i32.eq / br_if`
chains and rewrite). Rejected: labels must be re-decoded from operands,
exhaustiveness is not locally known, `Wasm.AST` is functional (replacement
is unnatural), and no existing peephole infrastructure to attach to.

**Option C — pattern-code EDSL with an algebraic-effect strategy query**
— the V2 proposal. `compile_enhanced.ml:10635` composes pattern-matching
via `patternCode` values combined with `(^^^)`. Both flat variant arms
and or-pattern arms end up producing the *same* kind of "compare tag
hash H, branch to body B" fragment at this level — the IR-shape
distinction that Option A is blind to has already been elaborated away.

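To make Option C concrete, here is a schematic of the leaf fragment both
shapes reduce to. Illustrative only: `get_variant_tag_code`, `compile_eq_const`
and `G.if0` are stand-in names, not the real signatures; the actual
`patternCode` encoding is quoted under the Recognizer heading below.

```ocaml
(* Sketch. Whether it came from a flat arm `case (#tue) e` or from one
   leg of `case (#tue or #thu) e`, the EDSL bottoms out in the same
   "compare tag hash, fall through on mismatch" fragment: *)
let tag_leaf (hash : int32) (body : G.t) : patternCode =
  CanFail (fun fail_code ->
    get_variant_tag_code ^^        (* runtime: load the scrutinee's tag *)
    compile_eq_const hash ^^       (* compare against the constant hash *)
    G.if0 body fail_code)          (* hit: run body; miss: next leg     *)
```
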
+ +### Why V1 is not enough + +`isWeekday` (7 flat arms: `#mon → true; #tue → true; ...; #sun → false`) +optimises to `br_table`. The semantically-equal `isWeekdayOr` +(or-pattern: `case (#mon | #tue | ... | #fri) true; case _ false`) +does not — at IR the or-pattern is a *single* arm with a disjunctive +pattern, so `SwitchE` sees 2 arms and the 4-arm threshold rejects it. +Both should dispatch identically. + +### Architecture: Handler / Recognizer split + +Two roles, connected by an algebraic-effect protocol (OCaml 5.3 effects, +same machinery as ConstTrack Phase 3). Neither role is variant-specific +— the mechanism is a general matching-EDSL facility for any dispatch +decision. Variant switches are just its first application; AND-patterns +and literal-match chains are future applications of the same protocol. + +**Handler** — installed at IR dispatch nodes (`SwitchE` and friends). +Sees the IR node and its type. Knows what kind of decision is being +compiled (variant tag match, integer literal match, record projection, +tuple component match, …), so it knows which strategies *could* apply +and what cost model is meaningful for this decision shape. When the +recognizer asks, the handler computes and returns the chosen strategy. + +**Recognizer** — lives inside the matching EDSL — *the procedural +combinator calls* (`fill_pat`, `compile_pat_local`, `(^^^)`, +`orElse`, `orsPatternFailure`, …) that together emit Wasm block +structures for pattern matching. `patternCode` itself is opaque +(`CannotFail of G.t | CanFail of (G.t -> G.t)`) — no walkable AST +survives composition — so the recognizer must observe decisions +*during* emission, not after. At each point where a tag-hash +comparison (or, later, a literal comparison, component projection, +…) is about to be emitted, the combinator `perform`s an effect +— `Match_decision { token; body_compiler; scrutinee_repr }` — +to the enclosing handler. The handler collects incrementally and, +when the arm set is complete, commits on a strategy and emits. + +The protocol is deliberately generic: + +- `token` is opaque to the recognizer — for variants it's a tag + hash, for integer literals it's an immediate, for constructor + matching it's a nominal ID. The handler interprets it. +- `body_compiler` is a thunk the handler can invoke (or not) to + emit the arm body's Wasm. Giving the handler *control over + emission* — rather than just asking it for a strategy to feed + back in — is what unlocks AND-patterns later (the handler can + return `No_op` and suppress emission entirely when a component + is already known from an outer context). +- `scrutinee_repr` says how to obtain the discriminating value + (already loaded on the stack, loadable from a known local, computable + by a callback). The handler chooses strategies compatible with that + shape. + +### Concrete emission points in the EDSL + +In `compile_enhanced.ml`, the `perform`-sites are: + +- `fill_pat env ae (TagP (l, p))` — surfaces `Match_decision` with + `token = hash_variant_label l`, `body_compiler = compile-tail-of-arm`. +- `fill_pat env ae (AltP (p1, p2))` when both legs are `TagP` or + nested `AltP` chains bottoming out in `TagP` — surfaces one + `Match_decision` per leaf, sharing the same `body_compiler` + (this is what makes or-patterns auto-fold). +- Later, literal-pattern arms (`LitP`) would surface `Match_decision` + with `token = immediate`. + +No changes to `(^^^)`, `orElse`, or `orsPatternFailure` — they stay +pure G.t manipulation. 
The effects are attached at the *leaf* +combinators that know what kind of value is being discriminated. + +### Strategy for V2 launch + +Gosper-based `MaskShift` only — same cost model as V1. The multi- +strategy batched search (ModPrime, RotLow, …) remains listed below +as future work; the protocol is forward-compatible with them since +strategies are plan-values the handler returns, but the V2 milestone +is purely "make V1's `br_table` dispatch work through the effect +protocol, so or-patterns get it for free". + +### Success criterion + +An or-pattern switch must compile to byte-identical Wasm as its +hand-expanded flat-arm equivalent. Concretely: `isWeekdayOr` (case +`(#mon | #tue | ... | #fri) true; case _ false`) and `isWeekday` +(7 flat arms) must disassemble to the same `i32.and; [shr_u]; +br_table` dispatch, with identical mask, shift, table, and arm +blocks (modulo block-label numbering). A FileCheck test pinning +this equivalence should live in `test/run/variant_switch.mo` (or +a sibling) and is the acceptance gate for V2. + +### What this buys + +- **or-patterns auto-fold.** Distinct arms with identical bodies + naturally perform the same payload and the handler sees `k` + equivalence classes for free. The "Same-body arm merging" section + below becomes automatic under V2. +- **Generic.** The handler/recognizer split applies to any + multi-way decision. Future AND-patterns — where matching a + product pattern may want to *skip* components already tested by an + outer match — fit the same protocol: the outer handler installs + "component `k` is known to be `v`" context, the inner recognizer + asks, and the handler returns `No_op` instead of a dispatch plan. +- **Testable in isolation.** The recognizer can be exercised with a + synthetic handler that records the effect trace; the handler can + be exercised on hand-built `token_set` inputs without a Wasm + backend attached. +- **Context-sensitive overrides.** Outer handlers (debug flag, + size-budget pass, `--preserve-switch-shape` for disassembly) can + intercept and force a specific plan without touching call sites. + +### Migration path + +V1 (IR-level, shipped) stays in place for `compile_classical.ml`. V2 +lands first in `compile_enhanced.ml` where the matching EDSL lives, +then — if the architecture proves out — backports to the classical +backend. The `Match_decision` effect surface is narrow enough that +future handlers at different entry points (AND-pattern compilation, +literal-match chains, an IR pass that pre-folds or-patterns) can +slot in without changes at the recognizer side. + +## Implementation Steps + +### Step 1 — Mask-finding utility (compile time) + +Add near the `Variant` module in `compile_classical.ml`: + +```ocaml +(* Returns Some (mask, shift, dispatch_table) or None if too large. + dispatch_table.(j) = Some arm_index | None (hole → default). *) +val find_variant_dispatch : int32 list -> int -> + (int32 * int * int option array) option +``` + +### Step 2 — New dispatch path in `SwitchE` + +Extract `(TagP (lᵢ, _), body_i)` for each case, compute hashes via +`hash_variant_label env lᵢ`, call `find_variant_dispatch`. If `Some`: + +1. Emit `get_tag ^^ compile_unboxed_const M ^^ G.i (Binary (I32 And))` +2. If S > 0: emit `compile_unboxed_const (Int32.of_int S) ^^ G.i (Binary (I32 ShrU))` +3. Build `BrTable` target list from dispatch table +4. Wrap in nested blocks, emit arm bodies with `Br $exit` + +If `None`, fall through to `orsPatternFailure`. 
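The nested-block wrapping in item 4 collapses to a single fold; this is
condensed from the implementation further down (arm bodies already end in
their `Br $exit`, and `G.block0` / `^^` are the backend's existing
combinators):

```ocaml
(* Arm k sits under exactly k enclosing blocks, so br_table index k
   exits straight into arm k's body; the extra outermost block is the
   default target for table holes and traps immediately. *)
let with_arms =
  List.fold_left (fun acc body_code -> G.block0 acc ^^ body_code)
    dispatch arm_body_codes in
G.block0 with_arms ^^ G.i Unreachable
```
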

### Step 3 — Payload extraction

Call `Variant.project env` at the start of each arm body (same as current
path) to load the variant payload after dispatch.

### Step 4 — `compile_enhanced.ml` *(TODO)*

Apply the same optimisation to the EOP backend. Check for a parallel
`SwitchE` handler in `src/codegen/compile_enhanced.ml` and port the helpers
(`find_variant_mask` etc.) and the new dispatch path there.

Note: the bench tests (`test/bench/`) run under EOP by default — once this
step is done, those benchmarks will reflect the real instruction savings.

### Step 5 — Tests

- 3-arm, 5-arm, 10-arm, 20-arm variant switches
- Cases where S > 0 (mask has trailing zeros)
- Edge case: threshold exceeded → confirm fallback to linear chain
- Inspect `.wat` output to verify `br_table` + shift are emitted correctly

## Key Files

| File | Role |
|------|------|
| `src/codegen/compile_classical.ml` | Main: `SwitchE`, `simplify_cases`, `Variant` module |
| `src/codegen/compile_enhanced.ml` | Parallel backend, may need same change |
| `src/wasm-exts/ast.ml` | `BrTable` already defined (line 102) |
| `test/run/` | New test cases |

## Open Questions / Risks

1. **`FakeMultiVal.block_` interaction.** The multi-value block wrapper
   tracks nesting depth. The new nested arm blocks must be introduced
   *inside* `FakeMultiVal.block_` so depth accounting stays correct.

2. **Hash collisions between labels.** Essentially impossible (32-bit hash
   over distinct strings within one type), but the algorithm degrades
   gracefully — it simply needs more bits in `M`. Ultimate fallback:
   `M = ~0l`, `S = 0`, table size up to 2^31 (hashes are 31-bit; see
   item 5) → threshold exceeded → linear.

3. **Gosper iteration cutoff.** Without a bound, `iter_masks_with_popcount`
   may exhaust all C(32,k) candidates (e.g. C(32,4) = 35,960) before
   returning `None`. A cutoff of 2^16 iterations per popcount level is
   applied in both backends to keep compile time bounded.

4. **`None` fallback correctness — implemented.** The `SwitchE` br_table
   branch builds arm codes using `known_tag_pat` (outer tag check stripped).
   A `None` return from `find_variant_mask` (whether from the cutoff, no
   valid mask, or duplicate labels) must fall through to the regular linear
   `SwitchE` handler — not call `orsPatternFailure` with tag-check-free arms
   (which would let any arm match any outer tag).

   **Fix:** `find_variant_mask` is called in the `when` guard itself. If it
   returns `None`, the guard fails and OCaml's pattern match falls through to
   the ordinary `SwitchE` handler with full patterns. The `None` match arm
   in the body is eliminated entirely. The distinct-labels workaround is
   also removed — it is fully subsumed: duplicate labels cause `is_injective`
   to fail for every mask, so `find_variant_mask` returns `None` and the
   guard fails safely.

5. **31-bit vs 32-bit hashes — resolved.** `Mo_types.Hash.hash` always
   returns `logand 0x7fffffff sum`, so bit 31 is never set. Masks with
   bit 31 set are irrelevant (would never help distinguish hashes) and
   must be excluded: `Nat32.of_int32` raises `Invalid_argument` on
   negative int32 values (bit 31 set). Fix: change the Gosper loop
   guard from `!m <> 0l` to `!m > 0l` — stops at zero (wrapped) AND
   at any mask with bit 31 set. C(31,k) < 2^16 for k≤4 so this guard
   is now the binding constraint for small popcount levels.

6. **Threshold tuning.** `max(64, 4n)` is a starting point; may need
   benchmarking to confirm the right code-size / speed trade-off.

7. 
**GC forwarding pointers.** `get_variant_tag` already calls + `load_forwarding_pointer` — no change needed here. + +## Future Optimisation: Same-body arm merging + +### Scope decision (2026-04-23) + +Under V2, or-patterns (`case (#mon or #tue or … or #fri) false`) already +collapse to one arm block via the recognizer's `flatten_tag_leaves` +helper — all leaves of a single case contribute multiple entries to the +same sub-list in `Dispatch.Query`. That's the incentive channel: users +who recognize same-body structure and write an or-pattern get the +benefit. + +**We deliberately do NOT auto-merge arms with structurally-equal bodies +across distinct cases.** The reasoning: + +- Leaving cross-case merging to the user incentivises writing or-patterns, + which communicate intent (this group of tags genuinely shares an + outcome). Syntactic or-patterns are also stable under refactoring in a + way that "two cases happen to produce identical IR" is not. +- Auto-merging at the IR level is brittle: two arms with the same *IR* + expression may have different *typing contexts*, scope, or side-effect + behaviour in edge cases that an equality check might miss. +- More importantly, the right equivalence is **Wasm-instruction-sequence + equality**, not IR equality (see refinement below) — and that requires + ahead-of-time arm compilation, which is a larger restructuring. + +### Why same-body merging matters for strategy choice + +Same-body merging is **not** a speedup for the already-dispatched case +— each br_table slot still lands in its own arm block that executes the +same instructions regardless of whether the blocks are physically +shared. It's a code-size saving of that one duplicated body-block. + +But it **does** affect dispatcher choice upstream. The handler's +strategy space is parameterised by `N = number of distinct outcome +classes`, not by the raw arm count: + +- `ModPrime`: smaller `p` when classes merge — the br_table shrinks + linearly with class count. `mod 3` for 3 classes vs `mod 7` for 7 + (same per-op cost, but half the table bytes). +- `MaskShift`: fewer injectivity constraints → Gosper has more masks to + pick from → may land on smaller popcount (fewer SHR bits, more compact + table). +- Perfect hashing: easier to find for 3 tokens than 7. + +So merging propagates as input-size reduction through every strategy +downstream. Or-patterns already get this benefit because they arrive +pre-merged. Cross-case merging would extend it — but only where the +user didn't already write the or-pattern themselves. + +### Refinement: Wasm-level equivalence classes + +If we later add cross-case merging, the equivalence criterion should be +**raw Wasm instruction sequences**, not IR structural equality: + +1. Compile each arm's body ahead of choosing the dispatch strategy + (remember: each arm is already a `Block` internally, and composition + via `(^^^)` is just difference-list concatenation of `G.t`). +2. Compute an equivalence class per arm by comparing the compiled Wasm + byte sequences (modulo block-label numbering). +3. Pass the class-reduced `token_set` to `Dispatch.Query` — the handler + sees `k ≤ n` sub-lists. + +Advantages over IR-level merging: +- Blind to IR phase-ordering decisions (constant folding, inlining, + ANF) that might make two semantically-identical arms look different + at IR level. +- Catches arms that *compile to* identical bytes for reasons the IR + doesn't surface (e.g. two variant-projection patterns that happen to + emit the same offset load). 
+- No coupling to Motoko-specific expression-equality judgement — + works verbatim for any future matching context. + +### Incremental path + +1. V2 as-is: or-pattern merging only. Ships as part of #5927. +2. Later: ahead-of-time arm compilation in the recognizer, Wasm-bytes + comparison, cross-case merging. Handler protocol is already + compatible (token_set is a list-of-lists); this is a pure + recognizer-side extension. + +*Not yet implemented — tracked here for future work.* + +## Future Optimisation: Multi-strategy batched search + +The current single Gosper stream can still be slow for unlucky hash sets +(e.g. the 12-arm NNS `Action_` type needs ~8 000 iterations). The fix is +to run several independent strategy streams **concurrently** and take the +best early result — whichever strategy happens to work cheaply for the +given hashes terminates the search. + +### Candidate type + +```ocaml +type candidate = + | MaskShift of { mask : int32; shift : int; table_size : int } + (* runtime: (hash & mask) >> shift; overhead: AND + opt. SHR *) + | ModPrime of { prime : int; table_size : int } + (* runtime: hash % prime; overhead: i32.rem_u *) + | RotLow of { rot : int; bits : int; table_size : int } + (* runtime: rotl32(hash, rot) & (2^bits-1); overhead: i32.rotl + AND *) +``` + +### Cost model + +``` +cost(MaskShift{shift; table_size}) = 2 + (if shift > 0 then 1 else 0) + + table_size (* br_table entries *) +cost(ModPrime{prime}) = 3 + prime (* rem_u costs ~3 cycles *) +cost(RotLow{rot; bits}) = 2 + (if rot > 0 then 1 else 0) + + 1 lsl bits +``` + +Rank by estimated cycle cost; lower is better. + +### Strategy generators (lazy streams) + +1. **MaskShift (batched Gosper)**: For each bit-position window + `[2^n, 2^(n+1))` in increasing `n`, run Gosper within that window + with `popcount = bits_needed(arms)`, then `popcount+1`, etc. This + avoids charging to high-valued masks before exhausting low-bit windows. + +2. **ModPrime**: Iterate primes `p ≥ n` in increasing order; for each + check `hᵢ mod p` pairwise distinct. Terminates quickly when a small + collision-free prime exists. + +3. **RotLow**: For `bits = bits_needed(n)` and `rot = 0..30`, check + `rotl32(hᵢ, rot) & (2^bits - 1)` injective. 31 candidates per + `bits` level; try `bits+1` if none work. + +### Merger + +Run all generators round-robin (or priority-queue ordered by emitted cost). +Collect the first few injective candidates and emit the cheapest. A simple +scheme: advance each generator one step per round until the first result +appears in any stream; collect a bounded window of results (e.g. 4) across +all streams; return the cheapest. + +### Benefits + +- No single strategy's worst case dominates compile time. +- Prime-based dispatch is tried immediately and wins when a small prime + gives a tiny table (common for small variant types). +- Rotation-based dispatch is tried with only 31 probes and guarantees + table size ≤ 2^k — useful when Gosper + mod-prime both fail cheaply. +- The current 2^16 cutoff per strategy stream remains as a hard cap but + is rarely hit because one stream produces an early winner. + +### Interaction with same-body merging + +Apply the merged search after grouping arms by body (use equivalence-class +count as the effective `n`). 
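A minimal sketch of the merger, assuming each generator is exposed as a
lazy `Seq.t` that yields `Some c` when a step finds an injective candidate
and `None` when a step comes up empty; `cost` is the cycle model above,
and the bounded result window is simplified to "first productive round
wins":

```ocaml
(* Round-robin: take one step of every live stream per round; the first
   round that produces any candidate decides, and we keep its cheapest. *)
let merge_candidates (cost : candidate -> int)
    (streams : candidate option Seq.t list) : candidate option =
  let rec round = function
    | [] -> None                                 (* every stream exhausted *)
    | streams ->
      let hits, live =
        List.fold_left (fun (hits, live) s ->
            match s () with
            | Seq.Nil -> (hits, live)            (* stream gave up *)
            | Seq.Cons (None, rest) -> (hits, rest :: live)
            | Seq.Cons (Some c, rest) -> (c :: hits, rest :: live))
          ([], []) streams
      in
      (match hits with
       | [] -> round live
       | c :: cs ->
           Some (List.fold_left
                   (fun best c' -> if cost c' < cost best then c' else best)
                   c cs))
  in
  round streams
```
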
+ +*Not yet implemented — tracked here for future work.* + +## Future Optimisation: Pre-shortening before Gosper's iteration + +*(Subsumed by Multi-strategy search above — ModPrime and RotLow are the +concrete pre-shortening strategies described there.)* + +## Non-goals + +- Nested/wildcard patterns in variant arms (handled by `compile_pat_local`) +- `switch` on non-variant types +- JS backend (`moc.js`) diff --git a/src/codegen/compile_classical.ml b/src/codegen/compile_classical.ml index 563bef527e5..b084e038988 100644 --- a/src/codegen/compile_classical.ml +++ b/src/codegen/compile_classical.ml @@ -11170,6 +11170,74 @@ let compile_load_field env typ name = Object.load_idx env typ name +(* ===== Variant switch: masked br_table dispatch ===== + See .claude/plans/variant-switch-br-table.md for full design notes. *) + +(* ceil(log₂ n) — minimum bits needed to index n distinct values *) +let bits_needed n = + let rec f k = if 1 lsl k >= n then k else f (k + 1) in + if n <= 1 then 0 else f 1 + +(* Iterate all int32 bit-masks with exactly k bits set, in increasing + numerical order (Gosper's hack). Calls [f m] for each; stops early + if [f] returns true. *) +let iter_masks_with_popcount k f = + if k <= 0 || k > 32 then () + else + (* smallest k-bit mask: bits 0..k-1 set *) + let m = ref (Int32.sub (Int32.shift_left 1l k) 1l) in + let stop = ref false in + let iters = ref 0 in + while not !stop && !m > 0l && !iters < 0x10000 do (* 0l == wrapped; <0l == bit31 set (hashes are 31-bit) *) + incr iters; + if f !m then stop := true + else begin + (* Gosper's hack: next int32 with same popcount *) + let n = !m in + let c = Int32.logand n (Int32.neg n) in (* lowest set bit *) + let r = Int32.add n c in (* clear run, advance *) + m := Int32.logor r + (Int32.shift_right_logical + (Int32.div (Int32.logxor r n) c) 2) + end + done + +(* True iff all (h & mask) values in [hashes] are distinct *) +let is_injective mask hashes = + let masked = List.map (Int32.logand mask) hashes in + List.length masked = List.length (List.sort_uniq Int32.compare masked) + +(* Table size after applying mask+shift: max((hᵢ & mask) >> shift) + 1 *) +let compact_table_size mask shift hashes = + List.fold_left (fun acc h -> + max acc (Int32.to_int (Int32.shift_right_logical (Int32.logand h mask) shift)) + ) 0 hashes + 1 + +(* Find mask M and shift S for an n-arm variant switch. + Returns Some (mask, shift, table_size) or None if no suitable mask found. + Iterates masks of minimal popcount first (smallest integer = low bits = + compact table), accepting the first that is injective and within threshold. *) +let find_variant_mask n hashes = + let threshold = max 64 (4 * n) in + let rec try_popcount req = + if req > 8 then None + else + let result = ref None in + iter_masks_with_popcount req (fun mask -> + if is_injective mask hashes then begin + let shift = Int32.to_int Numerics.Nat32.(of_int32 mask |> ctz |> to_int32) in + let tbl = compact_table_size mask shift hashes in + if tbl <= threshold then begin + result := Some (mask, shift, tbl); true (* stop *) + end else false + end else false + ); + match !result with + | Some _ as r -> r + | None -> try_popcount (req + 1) + in + try_popcount (bits_needed n) + (* compile_lexp is used for expressions on the left of an assignment operator. 
Produces * preparation code, to run first @@ -12776,6 +12844,20 @@ and single_case e (cs : Ir.case list) = and known_tag_pat p = TagP ("", p) +(* Flatten a pattern into the list of tag-label leaves it matches, or None if + any leaf is not a non-empty TagP. AltP trees are collapsed left-to-right. + This turns `(#mon | #tue | ... | #fri)` into `[("mon", _); ("tue", _); ...]` + so the SwitchE handler can count it as 5 dispatch targets for mask-finding, + while still compiling the arm body just once. *) +and flatten_tag_leaves (p : Ir.pat) : (string * Ir.pat) list option = + match p.it with + | TagP (l, sub) when l <> "" -> Some [(l, sub)] + | AltP (p1, p2) -> + (match flatten_tag_leaves p1, flatten_tag_leaves p2 with + | Some l1, Some l2 -> Some (l1 @ l2) + | _ -> None) + | _ -> None + and simplify_cases e (cs : Ir.case list) = match cs, e.note.Note.typ with (* for a 2-cased variant type, the second comparison can be omitted when the first pattern @@ -12873,6 +12955,92 @@ and compile_exp_with_hint (env : E.t) ae sr_hint exp = G.i Unreachable (* We should always exit using the branch_code *) ) + (* Variant switch with 4+ tag leaves (post or-pattern flattening): + use masked br_table dispatch (O(1)). + Guard pre-checks find_variant_mask so that None falls through naturally + to the regular SwitchE handler below (no broken known_tag_pat fallback). + Or-patterns like `(#mon | #tue | ... | #fri) true` count as multiple + dispatch targets (one per leaf) but compile to a single arm body — the + dispatch table has multiple slots pointing to the same arm block. *) + | SwitchE (e, cs) when + (let per_case = List.map (fun {it=({pat; _} : case'); _} -> flatten_tag_leaves pat) cs in + List.for_all Option.is_some per_case && + let all_leaves = List.concat_map Option.get per_case in + let n = List.length all_leaves in + n >= 4 && + let hashes = List.map (fun (l, _) -> Variant.hash_variant_label env l) all_leaves in + find_variant_mask n hashes <> None) -> + let code1 = compile_exp_vanilla env ae e in + let (set_i, get_i) = new_local env "switch_in" in + + (* Per case: compile the body once (using the first leaf's sub-pattern + — Motoko or-pattern typing ensures all legs bind the same variables). + Record the set of leaf hashes that dispatch to this case. *) + let cases = List.map (fun {it={pat; exp=arm_exp}; _} -> + let [@warning "-8"] Some leaves = flatten_tag_leaves pat in + let [@warning "-8"] (_, first_sub_pat) :: _ = leaves in + let ae1, pat_code = compile_pat_local env ae {pat with it = known_tag_pat first_sub_pat} in + let sr, rhs_code = compile_exp_with_hint env ae1 sr_hint arm_exp in + let leaf_hashes = List.map (fun (l, _) -> Variant.hash_variant_label env l) leaves in + (leaf_hashes, sr, CannotFail get_i ^^^ pat_code ^^^ CannotFail rhs_code) + ) cs in + + let n_cases = List.length cases in + let all_hashes = List.concat_map (fun (hs, _, _) -> hs) cases in + let n_leaves = List.length all_hashes in + + let final_sr = match sr_hint with + | Some sr -> sr + | None -> StackRep.joins (List.map (fun (_, sr, _) -> sr) cases) + in + + let [@warning "-8"] Some (mask, shift, table_size) = find_variant_mask n_leaves all_hashes in + (* Build dispatch table: slot j -> case index (0..n_cases-1) or n_cases (default). + Every leaf of case k contributes one slot pointing to k. 
*) + let slot_for_case = Array.make table_size n_cases in + List.iteri (fun case_idx (leaf_hashes, _, _) -> + List.iter (fun hash -> + let slot = Int32.to_int + (Int32.shift_right_logical (Int32.logand hash mask) shift) in + slot_for_case.(slot) <- case_idx + ) leaf_hashes + ) cases; + + final_sr, + code1 ^^ set_i ^^ + FakeMultiVal.block_ env (StackRep.to_block_type env final_sr) (fun branch_code -> + + (* Dispatch code: load tag, mask, optional shift, br_table *) + let dispatch = + get_i ^^ + Variant.get_variant_tag env ^^ + compile_bitand_const mask ^^ + (if shift > 0 then compile_shrU_const (Int32.of_int shift) else G.nop) ^^ + G.i (BrTable ( + List.init table_size (fun j -> nr (Int32.of_int slot_for_case.(j))), + nr (Int32.of_int n_cases) (* default: unreachable *) + )) + in + + (* Arm body codes: sub-pattern match + rhs + SR-adjust + exit. + On sub-pattern failure (impossible for well-typed code): trap. *) + let arm_body_codes = List.map (fun (_, sr, c) -> + with_fail (G.i Unreachable) + (c ^^^ CannotFail (StackRep.adjust env sr final_sr ^^ branch_code)) + ) cases in + + (* Build nested blocks from inside out: + block_default { block_case_{k-1} { ... block_case_0 { dispatch } + body_0 ... } body_{k-1} } unreachable + Inside dispatch: label k -> case k, label n_cases -> default. + fold starts with dispatch (not an extra wrapper), so case_0 is label 0. *) + let with_arms = List.fold_left (fun acc body_code -> + G.block0 acc ^^ body_code + ) dispatch arm_body_codes in + G.block0 with_arms ^^ + G.i Unreachable + ) + | SwitchE (e, cs) -> let code1 = compile_exp_vanilla env ae e in let (set_i, get_i) = new_local env "switch_in" in diff --git a/src/codegen/compile_enhanced.ml b/src/codegen/compile_enhanced.ml index 269e355b340..30739dcc39b 100644 --- a/src/codegen/compile_enhanced.ml +++ b/src/codegen/compile_enhanced.ml @@ -11558,6 +11558,232 @@ let compile_load_field env typ name = Object.load_idx env typ name +(* ===== Variant switch: masked br_table dispatch ===== + See .claude/plans/variant-switch-br-table.md for full design notes. + NB: EOP backend uses int64 hashes throughout (unlike classical int32). *) + +(* ceil(log₂ n) — minimum bits needed to index n distinct values *) +let bits_needed n = + let rec f k = if 1 lsl k >= n then k else f (k + 1) in + if n <= 1 then 0 else f 1 + +(* Iterate all int64 bit-masks with exactly k bits set, in increasing + numerical order (Gosper's hack). Calls [f m] for each; stops early + if [f] returns true. 
*) +let iter_masks_with_popcount k f = + if k <= 0 || k > 32 then () (* EOP hashes are extend_i32_u: bits 0-31 only *) + else + (* smallest k-bit mask: bits 0..k-1 set *) + let m = ref (Int64.sub (Int64.shift_left 1L k) 1L) in + let stop = ref false in + let iters = ref 0 in + while not !stop && !m <> 0L && Int64.compare !m 0x1_0000_0000L < 0 && !iters < 0x10000 do + incr iters; + if f !m then stop := true + else begin + (* Gosper's hack: next int64 with same popcount *) + let n = !m in + let c = Int64.logand n (Int64.neg n) in (* lowest set bit *) + let r = Int64.add n c in (* clear run, advance *) + m := Int64.logor r + (Int64.shift_right_logical + (Int64.div (Int64.logxor r n) c) 2) + end + done + +(* True iff all (h & mask) values in [hashes] are distinct *) +let is_injective mask hashes = + let masked = List.map (Int64.logand mask) hashes in + List.length masked = List.length (List.sort_uniq Int64.compare masked) + +(* Table size after applying mask+shift: max((hᵢ & mask) >> shift) + 1 *) +let compact_table_size mask shift hashes = + List.fold_left (fun acc h -> + max acc (Int64.to_int (Int64.shift_right_logical (Int64.logand h mask) shift)) + ) 0 hashes + 1 + +(* Count trailing zeros of a non-zero int64 (works for any bit pattern). *) +let ctz64 m = + let c = ref (Int64.logand m (Int64.neg m)) in (* isolate lowest set bit *) + let s = ref 0 in + while !c <> 1L do c := Int64.shift_right_logical !c 1; incr s done; + !s + +(* Find mask M and shift S for an n-arm variant switch. + Returns Some (mask, shift, table_size) or None if no suitable mask found. + Iterates masks of minimal popcount first (smallest integer = low bits = + compact table), accepting the first that is injective and within threshold. *) +let find_variant_mask n hashes = + let threshold = max 64 (4 * n) in + let rec try_popcount req = + if req > 8 then None + else + let result = ref None in + iter_masks_with_popcount req (fun mask -> + if is_injective mask hashes then begin + let shift = ctz64 mask in + let tbl = compact_table_size mask shift hashes in + if tbl <= threshold then begin + result := Some (mask, shift, tbl); true (* stop *) + end else false + end else false + ); + match !result with + | Some _ as r -> r + | None -> try_popcount (req + 1) + in + try_popcount (bits_needed n) + +(* Dispatch strategy protocol (effect-based, streaming). + + Recognizer — the code that walks a multi-way match (today: the + SwitchE variant case; tomorrow: and-patterns, literal-match chains) + — streams one case at a time via `Match_arm`, then signals + completion with `Match_join` to receive the strategy plan. Each + `Match_arm hashes` hands the handler one case's token set (or-pattern + legs contribute multiple hashes in a single `Match_arm` call). + + This streaming shape is cheap ceremony for whole-switch recognizers + like SwitchE (which already knows all cases upfront), but leaves + room for future recognizers that surface decisions incrementally — + AND-pattern sub-components, nested literal chains, etc. — without + protocol churn. + + Handler — maintains mutable state across `Match_arm` calls and + returns a `plan` at `Match_join`. The default handler runs + Gosper-based mask-finding. Outer scopes may override by installing + their own handler before the recognizer fires (e.g. a test wanting + to pin a specific plan, or a `--preserve-switch-shape` debug flag + that forces `Linear`). 
*) +module Dispatch = struct + type plan = + | MaskShift of { + mask : int64; + shift : int; + table_size : int; + (* For each slot j in the br_table, which case index receives it. + Slot value `n_cases` means "default" (unreachable for + exhaustive variant matches). *) + slot_for_case : int array; + } + | ModPrime of { + p : int; + (* For each residue r in 0..p-1, which case index receives it. *) + case_for_residue : int array; + } + | Linear (* fall back to the default linear-chain SwitchE emission *) + + type _ Effect.t += + | Match_arm : int64 list -> unit Effect.t + | Match_join : plan Effect.t + + let gosper_plan (case_hashes : int64 list list) : plan = + let flat = List.concat case_hashes in + let n = List.length flat in + let n_cases = List.length case_hashes in + if n < 4 then Linear + else match find_variant_mask n flat with + | None -> Linear + | Some (mask, shift, table_size) -> + let slot_for_case = Array.make table_size n_cases in + List.iteri (fun case_idx hashes -> + List.iter (fun hash -> + let slot = Int64.to_int + (Int64.shift_right_logical (Int64.logand hash mask) shift) in + slot_for_case.(slot) <- case_idx + ) hashes + ) case_hashes; + MaskShift { mask; shift; table_size; slot_for_case } + + (* Try `hash mod p` for small primes p ≥ n_cases. Succeeds when all + leaf hashes of the same case share a residue AND different cases + land on different residues — i.e. mod p naturally clusters or- + pattern branches together into the right case. The br_table is + then tiny: exactly p slots. Primes are tried smallest-first so + the emitted table is as compact as possible. + + Cost trade-off vs MaskShift: `rem_u` is slower than `and`+`shr_u` + on most Wasm engines (engine-dependent; roughly 5–20× per op), + but ModPrime's table can be drastically smaller when or-patterns + reduce the effective continuation count. For tiny c (say c ≤ 8) + the size win is real; for larger c MaskShift wins. *) + let modprime_plan (case_hashes : int64 list list) : plan option = + let n_cases = List.length case_hashes in + let candidates = [2; 3; 5; 7; 11; 13; 17; 19; 23; 29; 31] in + let try_prime p = + if p < n_cases then None + else + let case_for_residue = Array.make p n_cases in + let ok = ref true in + List.iteri (fun case_idx hashes -> + if !ok then + List.iter (fun hash -> + let r = Int64.to_int (Int64.rem hash (Int64.of_int p)) in + if case_for_residue.(r) = n_cases then + case_for_residue.(r) <- case_idx + else if case_for_residue.(r) <> case_idx then + ok := false + ) hashes + ) case_hashes; + if !ok then Some (ModPrime { p; case_for_residue }) + else None + in + List.find_map try_prime candidates + + (* Pick a plan. + + This policy is deliberately ad-hoc: its only job is to demonstrate + that the `Dispatch` protocol can pick *different* strategies for + or-patterns vs flat-arm expansions of the same switch, producing + measurably different cycle counts. It is NOT tuned for real + workloads yet — ModPrime's `rem_u` is more expensive than + MaskShift's `and`+`shr_u` under the ICP cycle model, so routing + or-patterns to ModPrime makes them *slower* than the flat form. + + Why keep it anyway: the split is visible in the benchmark + (`startLetterOr` > `startLetter` after this patch), which proves + the mechanism branches on `c < n`. A follow-up commit can replace + this with a smarter policy (e.g. case-aware Gosper that finds + smaller masks when or-patterns allow same-case hashes to share a + slot) without touching the protocol or the emitter. 
+ + Concretely: + - `n` = total leaves across all cases + - `c` = number of cases (distinct continuations) + - `c < n` ↔ at least one case has an or-pattern + + When `c < n` we route through ModPrime (fallback to MaskShift if + no prime works); when `c = n` we take the MaskShift fast path. *) + let choose_plan (case_hashes : int64 list list) : plan = + let c = List.length case_hashes in + let n = List.length (List.concat case_hashes) in + if n < 4 then Linear + else if c < n then + match modprime_plan case_hashes with + | Some p -> p + | None -> gosper_plan case_hashes + else + gosper_plan case_hashes + + (* Install the default handler around `body`. Arms submitted via + `Match_arm` accumulate in reverse; `Match_join` reverses once and + commits a plan. Each `with_handler` scope has its own accumulator + — nested switches don't leak state. *) + let with_handler (type r) (body : unit -> r) : r = + let arms_rev = ref [] in + let effc : type a. a Effect.t -> ((a, r) Effect.Deep.continuation -> r) option = function + | Match_arm hashes -> + Some (fun k -> + arms_rev := hashes :: !arms_rev; + Effect.Deep.continue k ()) + | Match_join -> + Some (fun k -> + Effect.Deep.continue k (choose_plan (List.rev !arms_rev))) + | _ -> None + in + Effect.Deep.try_with body () { Effect.Deep.effc = effc } +end + (* compile_lexp is used for expressions on the left of an assignment operator. Produces * preparation code, to run first @@ -13100,6 +13326,20 @@ and single_case e (cs : Ir.case list) = and known_tag_pat p = TagP ("", p) +(* Flatten a pattern into the list of tag-label leaves it matches, or None if + any leaf is not a non-empty TagP. AltP trees are collapsed left-to-right. + This turns `(#mon | #tue | ... | #fri)` into `[("mon", _); ("tue", _); ...]` + so the SwitchE handler can count it as 5 dispatch targets for mask-finding, + while still compiling the arm body just once. *) +and flatten_tag_leaves (p : Ir.pat) : (string * Ir.pat) list option = + match p.it with + | TagP (l, sub) when l <> "" -> Some [(l, sub)] + | AltP (p1, p2) -> + (match flatten_tag_leaves p1, flatten_tag_leaves p2 with + | Some l1, Some l2 -> Some (l1 @ l2) + | _ -> None) + | _ -> None + and simplify_cases e (cs : Ir.case list) = match cs, e.note.Note.typ with (* for a 2-cased variant type, the second comparison can be omitted when the first pattern @@ -13201,29 +13441,116 @@ and compile_exp_with_hint (env : E.t) ae sr_hint exp = let code1 = compile_exp_vanilla env ae e in let (set_i, get_i) = new_local env "switch_in" in - (* compile subexpressions and collect the provided stack reps *) - let codes = List.map (fun {it={pat; exp=e}; _} -> - let (ae1, pat_code) = compile_pat_local env ae pat in - let (sr, rhs_code) = compile_exp_with_hint env ae1 sr_hint e in - (sr, CannotFail get_i ^^^ pat_code ^^^ CannotFail rhs_code) + (* Recognizer: if every case is tag-leafy (`TagP` possibly wrapped in + nested `AltP` legs), stream each case's leaf hashes to the + `Dispatch` handler via `Match_arm`, then request the plan via + `Match_join`. Or-patterns contribute multiple hashes in one + `Match_arm` but compile to a single arm body. A `MaskShift` plan + fires br_table dispatch; a `Linear` plan or a non-tag-leafy case + falls through to the linear-chain emission below. 
*) + let maybe_plan : Dispatch.plan option = + if List.for_all (fun {it=({pat; _} : case'); _} -> + Option.is_some (flatten_tag_leaves pat)) cs then + Some (Dispatch.with_handler (fun () -> + List.iter (fun {it=({pat; _} : case'); _} -> + let [@warning "-8"] Some leaves = flatten_tag_leaves pat in + let hashes = List.map (fun (l, _) -> Variant.hash_variant_label env l) leaves in + Effect.perform (Dispatch.Match_arm hashes) + ) cs; + Effect.perform Dispatch.Match_join)) + else None + in + + (match maybe_plan with + | Some ((Dispatch.MaskShift _ | Dispatch.ModPrime _) as plan) -> + (* Per case: compile the body once (using the first leaf's sub-pattern + — Motoko or-pattern typing ensures all legs bind the same variables). *) + let cases = List.map (fun {it={pat; exp=arm_exp}; _} -> + let [@warning "-8"] Some leaves = flatten_tag_leaves pat in + let [@warning "-8"] (_, first_sub_pat) :: _ = leaves in + let ae1, pat_code = compile_pat_local env ae {pat with it = known_tag_pat first_sub_pat} in + let sr, rhs_code = compile_exp_with_hint env ae1 sr_hint arm_exp in + (sr, CannotFail get_i ^^^ pat_code ^^^ CannotFail rhs_code) + ) cs in + + let n_cases = List.length cases in + + let final_sr = match sr_hint with + | Some sr -> sr + | None -> StackRep.joins (List.map fst cases) + in + + (* Build the dispatch prologue + br_table per plan. Both strategies + arrive at the same (case_idx, default) br_table interface — only + the index-computation prologue differs. *) + let dispatch_code = + let prologue_plus_table = + match plan with + | Dispatch.MaskShift { mask; shift; table_size; slot_for_case } -> + compile_bitand_const mask ^^ + (if shift > 0 then compile_shrU_const (Int64.of_int shift) else G.nop) ^^ + G.i (Convert (Wasm_exts.Values.I32 I32Op.WrapI64)) ^^ + G.i (BrTable ( + List.init table_size (fun j -> nr (Int32.of_int slot_for_case.(j))), + nr (Int32.of_int n_cases) (* default: unreachable *) + )) + | Dispatch.ModPrime { p; case_for_residue } -> + compile_unboxed_const (Int64.of_int p) ^^ + G.i (Binary (Wasm_exts.Values.I64 I64Op.RemU)) ^^ + G.i (Convert (Wasm_exts.Values.I32 I32Op.WrapI64)) ^^ + G.i (BrTable ( + List.init p (fun r -> nr (Int32.of_int case_for_residue.(r))), + nr (Int32.of_int n_cases) + )) + | Dispatch.Linear -> assert false + in + get_i ^^ Variant.get_variant_tag env ^^ prologue_plus_table + in + + final_sr, + code1 ^^ set_i ^^ + FakeMultiVal.block_ env (StackRep.to_block_type env final_sr) (fun branch_code -> + + (* Arm body codes: sub-pattern match + rhs + SR-adjust + exit. + On sub-pattern failure (impossible for well-typed code): trap. *) + let arm_body_codes = List.map (fun (sr, c) -> + with_fail (G.i Unreachable) + (c ^^^ CannotFail (StackRep.adjust env sr final_sr ^^ branch_code)) + ) cases in + + (* Build nested blocks from inside out: + block_default { block_case_{k-1} { ... block_case_0 { dispatch } + body_0 ... } body_{k-1} } unreachable + Inside dispatch: label k -> case k, label n_cases -> default. + fold starts with dispatch (not an extra wrapper), so case_0 is label 0. *) + let with_arms = List.fold_left (fun acc body_code -> + G.block0 acc ^^ body_code + ) dispatch_code arm_body_codes in + G.block0 with_arms ^^ + G.i Unreachable + ) + + | Some Dispatch.Linear | None -> + (* Linear hash-compare chain (default). 
*) + let codes = List.map (fun {it={pat; exp=e}; _} -> + let (ae1, pat_code) = compile_pat_local env ae pat in + let (sr, rhs_code) = compile_exp_with_hint env ae1 sr_hint e in + (sr, CannotFail get_i ^^^ pat_code ^^^ CannotFail rhs_code) ) (simplify_cases e cs) in - (* Use the expected stackrep, if given, else infer from the branches *) - let final_sr = match sr_hint with - | Some sr -> sr - | None -> StackRep.joins (List.map fst codes) - in + let final_sr = match sr_hint with + | Some sr -> sr + | None -> StackRep.joins (List.map fst codes) + in - final_sr, - (* Run scrut *) - code1 ^^ set_i ^^ - (* Run rest in block to exit from *) - FakeMultiVal.block_ env (StackRep.to_block_type env final_sr) (fun branch_code -> - orsPatternFailure env (List.map (fun (sr, c) -> - c ^^^ CannotFail (StackRep.adjust env sr final_sr ^^ branch_code) - ) codes) ^^ - G.i Unreachable (* We should always exit using the branch_code *) - ) + final_sr, + code1 ^^ set_i ^^ + FakeMultiVal.block_ env (StackRep.to_block_type env final_sr) (fun branch_code -> + orsPatternFailure env (List.map (fun (sr, c) -> + c ^^^ CannotFail (StackRep.adjust env sr final_sr ^^ branch_code) + ) codes) ^^ + G.i Unreachable (* We should always exit using the branch_code *) + )) (* Async-wait lowering support features *) | DeclareE (name, typ, e) -> let ae1, i = VarEnv.add_local_with_heap_ind env ae name typ in diff --git a/test/bench/ok/variant-switch.drun-run.ok b/test/bench/ok/variant-switch.drun-run.ok index dfa3aef2a7c..9ded029edea 100644 --- a/test/bench/ok/variant-switch.drun-run.ok +++ b/test/bench/ok/variant-switch.drun-run.ok @@ -1,9 +1,9 @@ ingress Completed: Reply: 0x4449444c016c01b3c4b1f204680100010a00000000000000000101 ingress Completed: Reply: 0x4449444c0000 -debug.print: {heap_diff = 0; instr_diff = 137_590_321; total = 1_090_000} +debug.print: {heap_diff = 0; instr_diff = 95_690_321; total = 1_090_000} ingress Completed: Reply: 0x4449444c0000 -debug.print: {fib7_eval = 13; fib7_evalFT = 13; fib7_xform = 13; instr_eval = 24_509_348; instr_evalFT = 21_536_248; instr_transform = 1_189_148} +debug.print: {fib7_eval = 13; fib7_evalFT = 13; fib7_xform = 13; instr_eval = 22_051_348; instr_evalFT = 21_536_248; instr_transform = 1_057_948} ingress Completed: Reply: 0x4449444c0000 -debug.print: {acc1 = 20_000; acc2 = 20_000; instr_isWeekend = 10_010_321; instr_isWeekendOr = 11_050_321} +debug.print: {acc1 = 20_000; acc2 = 20_000; instr_isWeekend = 7_070_321; instr_isWeekendOr = 7_560_321; instr_startLetter = 6_270_321; instr_startLetterOr = 6_760_321; last3 = 'S'; last4 = 'S'} ingress Completed: Reply: 0x4449444c0000 ingress Completed: Reply: 0x4449444c0000 diff --git a/test/bench/variant-switch.mo b/test/bench/variant-switch.mo index 9e5a935cae1..2e2a8b07ac2 100644 --- a/test/bench/variant-switch.mo +++ b/test/bench/variant-switch.mo @@ -120,6 +120,30 @@ persistent actor Core { case (#Sat or #Sun) true; }; + // 7-way distinct-outcome dispatch (richer than the 2-way isWeekend Bool). + // Bodies include two natural same-body groups: {Tue, Thu} → 'T' and + // {Sat, Sun} → 'S'. Useful to compare flat-arm vs or-pattern compilation + // when the payload is varied rather than collapsed to a Bool. 
+ func startLetter(d : Weekday) : Char = + switch d { + case (#Mon) 'M'; + case (#Tue) 'T'; + case (#Wed) 'W'; + case (#Thu) 'T'; + case (#Fri) 'F'; + case (#Sat) 'S'; + case (#Sun) 'S'; + }; + + func startLetterOr(d : Weekday) : Char = + switch d { + case (#Mon) 'M'; + case (#Tue or #Thu) 'T'; + case (#Wed) 'W'; + case (#Fri) 'F'; + case (#Sat or #Sun) 'S'; + }; + transient let week : [Weekday] = [#Mon, #Tue, #Wed, #Thu, #Fri, #Sat, #Sun]; @@ -326,10 +350,26 @@ persistent actor Core { j += 1; }; let (_m2, n2) = counters(); + var last3 : Char = 'X'; + var k = 0; + while (k < 10_000) { + for (d in week.vals()) { last3 := startLetter d }; + k += 1; + }; + let (_m3, n3) = counters(); + var last4 : Char = 'X'; + var l = 0; + while (l < 10_000) { + for (d in week.vals()) { last4 := startLetterOr d }; + l += 1; + }; + let (_m4, n4) = counters(); debugPrint(debug_show { - acc1; acc2; - instr_isWeekend = n1 - n0; - instr_isWeekendOr = n2 - n1; + acc1; acc2; last3; last4; + instr_isWeekend = n1 - n0; + instr_isWeekendOr = n2 - n1; + instr_startLetter = n3 - n2; + instr_startLetterOr = n4 - n3; }); }; }; diff --git a/test/run/variant_switch.mo b/test/run/variant_switch.mo new file mode 100644 index 00000000000..e6f9e360323 --- /dev/null +++ b/test/run/variant_switch.mo @@ -0,0 +1,60 @@ +// Test masked br_table dispatch for variant switches (4+ arms) + +type Color = { #red; #green; #blue; #yellow }; + +func colorName(c : Color) : Text = + switch c { + case (#red) "red"; + case (#green) "green"; + case (#blue) "blue"; + case (#yellow) "yellow"; + }; + +assert (colorName (#red) == "red"); +assert (colorName (#green) == "green"); +assert (colorName (#blue) == "blue"); +assert (colorName (#yellow) == "yellow"); + +type Weekday = { #Mon; #Tue; #Wed; #Thu; #Fri; #Sat; #Sun }; + +func isWeekend(d : Weekday) : Bool = + switch d { + case (#Mon) false; + case (#Tue) false; + case (#Wed) false; + case (#Thu) false; + case (#Fri) false; + case (#Sat) true; + case (#Sun) true; + }; + +assert (not (isWeekend (#Mon))); +assert (not (isWeekend (#Fri))); +assert (isWeekend (#Sat)); +assert (isWeekend (#Sun)); + +// Same dispatch expressed with or-patterns — exercises same-body arm merging +func isWeekendOr(d : Weekday) : Bool = + switch d { + case (#Mon or #Tue or #Wed or #Thu or #Fri) false; + case (#Sat or #Sun) true; + }; + +assert (not (isWeekendOr (#Mon))); +assert (not (isWeekendOr (#Fri))); +assert (isWeekendOr (#Sat)); +assert (isWeekendOr (#Sun)); + +// Variant with payload — sub-pattern binding still works +type Shape = { #circle : Float; #rect : (Float, Float); #tri : Float; #dot : () }; + +func area(s : Shape) : Float = + switch s { + case (#circle r) r * r * 3.14159; + case (#rect(w, h)) w * h; + case (#tri b) b * b * 0.5; + case (#dot _) 0.0; + }; + +assert (area (#dot ()) == 0.0); +assert (area (#rect(3.0, 4.0)) == 12.0);
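
// Suggested additional coverage (values chosen so the Float arithmetic
// stays exact and == is safe): the circle and tri arms are otherwise
// never executed.
assert (area (#circle 1.0) == 3.14159);
assert (area (#tri 4.0) == 8.0);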