diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml
index bebaf4b8e..ecc3a53dd 100644
--- a/diskann-benchmark/Cargo.toml
+++ b/diskann-benchmark/Cargo.toml
@@ -63,6 +63,9 @@ scalar-quantization = []
 # Enable minmax-quantization based algorithms
 minmax-quantization = []
 
+# Enable multi-vector MaxSim distance benchmarks
+multi-vector = []
+
 # Enable Disk Index benchmarks
 disk-index = [
     "diskann-disk/perf_test",
diff --git a/diskann-benchmark/example/multi-vector.json b/diskann-benchmark/example/multi-vector.json
new file mode 100644
index 000000000..af66a886d
--- /dev/null
+++ b/diskann-benchmark/example/multi-vector.json
@@ -0,0 +1,47 @@
+{
+  "search_directories": [],
+  "jobs": [
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "auto",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "scalar",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "reference",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "isa": "auto",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json
new file mode 100644
index 000000000..8d5997199
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json
@@ -0,0 +1,16 @@
+{
+  "checks": [
+    {
+      "input": {
+        "type": "multi-vector-op",
+        "content": {}
+      },
+      "tolerance": {
+        "type": "multi-vector-tolerance",
+        "content": {
+          "min_time_regression": 0.05
+        }
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark/perf_test_inputs/multi-vector.json b/diskann-benchmark/perf_test_inputs/multi-vector.json
new file mode 100644
index 000000000..c4ce9bb8b
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/multi-vector.json
@@ -0,0 +1,149 @@
+{
+  "search_directories": [],
+  "jobs": [
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "auto",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "scalar",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "x86-64-v3",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "x86-64-v4",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "reference",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "isa": "x86-64-v3",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "isa": "x86-64-v4",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "isa": "reference",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark/src/backend/mod.rs b/diskann-benchmark/src/backend/mod.rs
index 8396577e8..d04bae158 100644
--- a/diskann-benchmark/src/backend/mod.rs
+++ b/diskann-benchmark/src/backend/mod.rs
@@ -9,11 +9,13 @@ mod disk_index;
 mod exhaustive;
 mod filters;
 mod index;
+mod multi_vector;
 
 pub(crate) fn register_benchmarks(registry: &mut Registry) -> anyhow::Result<()> {
     exhaustive::register_benchmarks(registry)?;
     disk_index::register_benchmarks(registry)?;
     index::register_benchmarks(registry)?;
     filters::register_benchmarks(registry)?;
+    multi_vector::register_benchmarks(registry)?;
     Ok(())
 }
diff --git a/diskann-benchmark/src/backend/multi_vector/driver.rs b/diskann-benchmark/src/backend/multi_vector/driver.rs
new file mode 100644
index 000000000..57446ae9b
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/driver.rs
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Shared benchmark infrastructure for multi-vector kernels.
+//!
+//! Houses the timing harness ([`run_loops`]), data fixtures ([`Data`]), result
+//! types ([`RunResult`], [`Comparison`], [`CheckResult`]), and the trait-object
+//! [`Distance<T>`] boundary the driver dispatches through. None of the
+//! contents are kernel-aware.
+
+use diskann_benchmark_runner::{
+    utils::{
+        fmt::Table,
+        num::{relative_change, NonNegativeFinite},
+        percentiles, MicroSeconds,
+    },
+    Checker, Input,
+};
+use diskann_quantization::multi_vector::{Mat, MatRef, MaxSimKernel, Standard};
+use rand::{
+    distr::{Distribution, StandardUniform},
+    rngs::StdRng,
+    SeedableRng,
+};
+use serde::{Deserialize, Serialize};
+
+use crate::inputs::multi_vector::Run;
+use crate::utils::DisplayWrapper;
+
+//////////////////////
+// Tolerance        //
+//////////////////////
+
+/// Tolerance thresholds for multi-vector benchmark regression detection.
+///
+/// `min_time_regression` is the largest tolerated relative increase in a run's minimum
+/// time: `0.05` tolerates a 5% increase, and anything larger is reported as a failure.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub(super) struct MultiVectorTolerance {
+    pub(super) min_time_regression: NonNegativeFinite,
+}
+
+impl Input for MultiVectorTolerance {
+    type Raw = Self; // no separate raw form: `from_raw` hands its argument back unchanged
+
+    fn tag() -> &'static str {
+        "multi-vector-tolerance" // same string as the tolerance `type` in multi-vector-tolerance.json
+    }
+
+    fn from_raw(raw: Self::Raw, _checker: &mut Checker) -> anyhow::Result<Self> {
+        Ok(raw)
+    }
+
+    fn serialize(&self) -> anyhow::Result<serde_json::Value> {
+        Ok(serde_json::to_value(self)?)
+    }
+
+    fn example() -> Self {
+        const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.05) {
+            Ok(v) => v,
+            Err(_) => panic!("use a non-negative finite please"), // const context: a bad literal fails the build
+        };
+
+        MultiVectorTolerance {
+            min_time_regression: EXAMPLE,
+        }
+    }
+}
+
+///////////////////
+// Data fixtures //
+///////////////////
+
+/// Random query / doc fixture for a single benchmark run.
+pub(super) struct Data<T: Copy> {
+    pub(super) queries: Mat<Standard<T>>, // shape from `Standard::new(num_query_vectors, dim)`
+    pub(super) docs: Mat<Standard<T>>, // shape from `Standard::new(num_doc_vectors, dim)`
+}
+
+impl<T: Copy> Data<T>
+where
+    StandardUniform: Distribution<T>,
+{
+    pub(super) fn new(run: &Run) -> Self {
+        let mut rng = StdRng::seed_from_u64(0x12345); // fixed seed: repeatable fixture contents
+        let queries = Mat::from_fn(
+            Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(), // NOTE(review): panics if the shape is rejected — confirm when
+            || StandardUniform.sample(&mut rng),
+        );
+        let docs = Mat::from_fn(
+            Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(),
+            || StandardUniform.sample(&mut rng), // continues the same RNG stream after `queries`
+        );
+        Self { queries, docs }
+    }
+}
+
+//////////////////////
+// Distance trait   //
+//////////////////////
+
+/// Object-safe distance executor. The library factory's `Erase` visitor
+/// already produces a `Box<dyn MaxSimKernel<T>>`, but the driver wants its
+/// own narrow trait so the kernel + its assertions are tucked inside one
+/// vtable boundary. Simpler than threading `Box<dyn MaxSimKernel<T>>`
+/// generically through the timing harness.
+pub(super) trait Distance<T: Copy> {
+    fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]); // `scores` is the caller-owned output buffer
+}
+
+/// Distance executor wrapping a boxed `MaxSimKernel<T>` from the library
+/// factory. Adds a second vtable hop per call when used through `dyn Distance`.
+pub(super) struct BoxedKernel<T: Copy>(pub(super) Box<dyn MaxSimKernel<T>>);
+
+impl<T: Copy> Distance<T> for BoxedKernel<T> {
+    fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
+        let nq = self.0.nrows(); // NOTE(review): presumably the query-vector count the kernel was built from — confirm
+        assert_eq!(
+            scores.len(),
+            nq,
+            "scores buffer not right size: {} != {}",
+            scores.len(),
+            nq
+        );
+        if doc.num_vectors() == 0 {
+            return; // empty doc: the kernel is skipped and `scores` is left as-is
+        }
+        self.0.compute_max_sim(doc, scores); // dynamic dispatch through the boxed kernel
+    }
+}
+
+//////////////////////
+// Timing harness   //
+//////////////////////
+
+fn run_loops(run: &Run, body: &mut dyn FnMut()) -> RunResult { // times `num_measurements` batches of `body` calls
+    let mut latencies = Vec::with_capacity(run.num_measurements.get());
+
+    for _ in 0..run.num_measurements.get() {
+        let start = std::time::Instant::now();
+        for _ in 0..run.loops_per_measurement.get() {
+            body();
+        }
+        latencies.push(start.elapsed().into()); // one sample per batch, not per `body` call
+    }
+
+    let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); // NOTE(review): takes `&mut`, may reorder `latencies` — confirm
+    RunResult {
+        run: run.clone(),
+        latencies,
+        percentiles,
+    }
+}
+
+/// Shared loop nest. The trait-object dispatch happens once per `max_sim` call,
+/// i.e. once per inner-loop iteration of `run_loops`; each call does O(Q·D·dim)
+/// work, so the vtable hop is in the noise.
+pub(super) fn run_with_distance<T: Copy>(
+    run: &Run,
+    doc: MatRef<'_, Standard<T>>,
+    dist: &dyn Distance<T>,
+) -> RunResult {
+    let mut scores = vec![0.0f32; run.num_query_vectors.get()]; // allocated once, reused by every call
+    run_loops(run, &mut || {
+        dist.max_sim(doc, &mut scores);
+        std::hint::black_box(&mut scores); // keeps the optimizer from discarding the computed scores
+    })
+}
+
+//////////////////////
+// Result types     //
+//////////////////////
+
+#[derive(Debug, Serialize, Deserialize)]
+pub(super) struct RunResult {
+    /// The configuration for this run.
+    pub(super) run: Run,
+    /// One entry per measurement; each is the total time of `loops_per_measurement` calls.
+    pub(super) latencies: Vec<MicroSeconds>,
+    /// Summary statistics (e.g. `minimum`, `mean`) computed over `latencies`.
+    pub(super) percentiles: percentiles::Percentiles<MicroSeconds>,
+}
+
+impl RunResult {
+    pub(super) fn computations_per_latency(&self) -> usize { // (query, doc) vector pairs covered by one latency sample
+        self.run.num_query_vectors.get()
+            * self.run.num_doc_vectors.get()
+            * self.run.loops_per_measurement.get()
+    }
+}
+
+impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.is_empty() {
+            return Ok(()); // an empty slice prints nothing, not even the legend
+        }
+
+        writeln!(
+            f,
+            "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)"
+        )?;
+
+        let header = [
+            "Q",
+            "D",
+            "Dim",
+            "Min Time (ns/IP @ Dim)",
+            "Mean Time (ns/IP @ Dim)",
+            "Loops",
+            "Measurements",
+        ];
+
+        let mut table = Table::new(header, self.len());
+
+        self.iter().enumerate().for_each(|(row, r)| {
+            let mut row = table.row(row); // shadows the row index with the row handle
+
+            let min_latency = r
+                .latencies
+                .iter()
+                .min()
+                .copied()
+                .unwrap_or(MicroSeconds::new(u64::MAX)); // fallback only reachable when `latencies` is empty
+            let mean_latency = r.percentiles.mean;
+
+            let computations_per_latency = r.computations_per_latency() as f64;
+            let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; // x1000: presumably µs -> ns (see the "ns/IP" header)
+            let mean_time = mean_latency / computations_per_latency * 1000.0;
+
+            row.insert(r.run.num_query_vectors, 0);
+            row.insert(r.run.num_doc_vectors, 1);
+            row.insert(r.run.dim, 2);
+            row.insert(format!("{:.3}", min_time), 3);
+            row.insert(format!("{:.3}", mean_time), 4);
+            row.insert(r.run.loops_per_measurement, 5);
+            row.insert(r.run.num_measurements, 6);
+        });
+
+        table.fmt(f)
+    }
+}
+
+//////////////////////
+// Regression Check //
+//////////////////////
+
+/// Per-run comparison of the minimum time (scaled per computation) before and after.
+#[derive(Debug, Serialize)]
+pub(super) struct Comparison {
+    pub(super) run: Run,
+    pub(super) tolerance: MultiVectorTolerance, // copy of the tolerance the run was checked against
+    pub(super) before_min: f64,
+    pub(super) after_min: f64,
+}
+
+/// Aggregated result of the regression check across all runs.
+#[derive(Debug, Serialize)]
+pub(super) struct CheckResult {
+    pub(super) checks: Vec<Comparison>, // one `Comparison` per run; rendered as one table row each
+}
+
+impl std::fmt::Display for CheckResult {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let header = [
+            "Q",
+            "D",
+            "Dim",
+            "Min Before (ns/IP @ Dim)",
+            "Min After (ns/IP @ Dim)",
+            "Change (%)",
+            "Remark",
+        ];
+
+        let mut table = Table::new(header, self.checks.len());
+
+        for (i, c) in self.checks.iter().enumerate() {
+            let mut row = table.row(i);
+            let change = relative_change(c.before_min, c.after_min); // recomputed here; `Comparison` does not store it
+
+            row.insert(c.run.num_query_vectors, 0);
+            row.insert(c.run.num_doc_vectors, 1);
+            row.insert(c.run.dim, 2);
+            row.insert(format!("{:.3}", c.before_min), 3);
+            row.insert(format!("{:.3}", c.after_min), 4);
+            match change {
+                Ok(change) => {
+                    row.insert(format!("{:.3} %", change * 100.0), 5); // fraction -> percent
+                    if change > c.tolerance.min_time_regression.get() {
+                        row.insert("FAIL", 6); // no remark is inserted for rows within tolerance
+                    }
+                }
+                Err(err) => {
+                    row.insert("invalid", 5);
+                    row.insert(err, 6); // the error itself becomes the remark
+                }
+            }
+        }
+
+        table.fmt(f)
+    }
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/kernels.rs b/diskann-benchmark/src/backend/multi_vector/kernels.rs
new file mode 100644
index 000000000..cc4e63b89
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/kernels.rs
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! `Benchmark` impls for the multi-vector MaxSim factory.
+//!
+//! A single generic [`Kernel<T>`] carrier supplies the `Benchmark` and
+//! `Regression` impls for every element type accepted by the library's
+//! [`MaxSimElement`] sealed trait. Each `try_match` checks `element_type`
+//! only; the JSON `isa` field is passed to the library factory at run time,
+//! and arch unavailability surfaces as a job-level error via
+//! [`NotSupported`](diskann_quantization::multi_vector::NotSupported).
+
+use std::io::Write;
+use std::marker::PhantomData;
+
+use diskann_benchmark_runner::{
+    benchmark::{FailureScore, MatchScore, PassFail, Regression},
+    utils::{datatype::AsDataType, num::relative_change},
+    Benchmark, Checkpoint, Output, Registry,
+};
+use diskann_quantization::multi_vector::{build_max_sim, BoxErase, MaxSimElement};
+use rand::distr::{Distribution, StandardUniform};
+
+use super::driver::{
+    run_with_distance, BoxedKernel, CheckResult, Comparison, Data, MultiVectorTolerance, RunResult,
+};
+use crate::inputs::multi_vector::MultiVectorOp;
+use crate::utils::DisplayWrapper;
+
+// ─────────────────────────────────────────────────────────────────────────
+//  Kernel<T> — generic carrier registered once per element type.
+// ─────────────────────────────────────────────────────────────────────────
+
+#[derive(Debug)]
+pub(super) struct Kernel<T>(PhantomData<T>); // holds no data; `T` only picks the element type
+
+impl<T> Kernel<T> {
+    pub(super) const fn new() -> Self { // `const fn`: callable in constant contexts
+        Self(PhantomData)
+    }
+}
+
+impl<T> Benchmark for Kernel<T>
+where
+    T: MaxSimElement + AsDataType,
+    StandardUniform: Distribution<T>,
+{
+    type Input = MultiVectorOp;
+    type Output = Vec<RunResult>; // one `RunResult` per entry of `input.runs`, in order
+
+    fn try_match(&self, from: &MultiVectorOp) -> Result<MatchScore, FailureScore> {
+        crate::utils::match_data_type::<T>(from.element_type) // only `element_type` is inspected; `isa` is not
+    }
+
+    fn run(
+        &self,
+        input: &MultiVectorOp,
+        _: Checkpoint<'_>,
+        mut output: &mut dyn Output,
+    ) -> anyhow::Result<Self::Output> {
+        writeln!(output, "{}", input)?; // echo the job description before running it
+        let mut results = Vec::with_capacity(input.runs.len());
+        for run in input.runs.iter() { // each run builds its own fixture and kernel
+            let data = Data::<T>::new(run);
+            let kernel = build_max_sim::<T, _>(input.isa.into(), data.queries.as_view(), BoxErase)?;
+            let dist = BoxedKernel(kernel); // a factory error above aborts the whole job via `?`
+            results.push(run_with_distance(run, data.docs.as_view(), &dist));
+        }
+        writeln!(output, "\n\n{}", DisplayWrapper(&*results))?;
+        Ok(results)
+    }
+
+    fn description(
+        &self,
+        f: &mut std::fmt::Formatter<'_>,
+        input: Option<&MultiVectorOp>,
+    ) -> std::fmt::Result {
+        match input {
+            None => writeln!(f, "- Element Type: {}", <T as AsDataType>::DATA_TYPE)?, // no input: describe this entry
+            Some(input) => {
+                let desc = <T as AsDataType>::describe(input.element_type);
+                if !desc.is_match() {
+                    writeln!(f, "\n    - Mismatched element type: {}", desc)?; // a matching input prints nothing
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+impl<T> Regression for Kernel<T>
+where
+    T: MaxSimElement + AsDataType,
+    StandardUniform: Distribution<T>,
+{
+    type Tolerances = MultiVectorTolerance;
+    type Pass = CheckResult; // pass and fail carry the same payload; only the variant differs
+    type Fail = CheckResult;
+
+    fn check(
+        &self,
+        tolerance: &MultiVectorTolerance,
+        _input: &MultiVectorOp,
+        before: &Vec<RunResult>,
+        after: &Vec<RunResult>,
+    ) -> anyhow::Result<PassFail<CheckResult, CheckResult>> {
+        anyhow::ensure!(
+            before.len() == after.len(),
+            "before has {} runs but after has {}",
+            before.len(),
+            after.len(),
+        );
+
+        let mut passed = true; // cleared by any run that regresses or cannot be compared
+        let checks: Vec<Comparison> = std::iter::zip(before.iter(), after.iter())
+            .enumerate()
+            .map(|(i, (b, a))| {
+                anyhow::ensure!(b.run == a.run, "run {i} mismatched"); // runs are paired by position
+
+                let computations_per_latency = b.computations_per_latency() as f64; // same for `a`: runs just checked equal
+                let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
+                let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
+
+                let comparison = Comparison {
+                    run: b.run.clone(),
+                    tolerance: *tolerance,
+                    before_min,
+                    after_min,
+                };
+
+                match relative_change(before_min, after_min) {
+                    Ok(change) => {
+                        if change > tolerance.min_time_regression.get() {
+                            passed = false;
+                        }
+                    }
+                    Err(_) => passed = false, // an uncomputable change counts as a failure
+                };
+
+                Ok(comparison) // the comparison is recorded whether or not the run passed
+            })
+            .collect::<anyhow::Result<Vec<Comparison>>>()?;
+
+        Ok(if passed {
+            PassFail::Pass(CheckResult { checks })
+        } else {
+            PassFail::Fail(CheckResult { checks })
+        })
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+//  Registration.
+// ─────────────────────────────────────────────────────────────────────────
+
+pub(super) fn register(registry: &mut Registry) -> anyhow::Result<()> { // one registration per element type
+    registry.register_regression("multi-vector-op-f32", Kernel::<f32>::new())?;
+    registry.register_regression("multi-vector-op-f16", Kernel::<half::f16>::new())?;
+    Ok(())
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/mod.rs b/diskann-benchmark/src/backend/multi_vector/mod.rs
new file mode 100644
index 000000000..2cbb2d9a6
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/mod.rs
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Multi-vector MaxSim distance benchmarks with regression detection.
+//!
+//! Registers one `Benchmark` entry per supported element type; the JSON
+//! `isa` field selects the kernel at run time via the library's
+//! [`build_max_sim`] factory. The set of accepted element types is gated by
+//! the sealed [`MaxSimElement`] trait.
+//!
+//! # Adding a new in-tree experimental kernel
+//!
+//! 1. **Library: variant + dispatch arm.** In
+//!    `diskann-quantization::multi_vector::distance`:
+//!    - Add a new variant to [`MaxSimIsa`] (in `isa.rs`).
+//!    - Implement [`MaxSimKernel<T>`] for your kernel struct (in
+//!      `factory.rs`, next to `Prepared` and `ReferenceKernel`).
+//!    - Add a matching arm to the [`MaxSimElement::build`] impl for each
+//!      element type your kernel supports — the arm constructs your kernel
+//!      and hands it to `erase.erase(...)`.
+//!
+//! 2. **Benchmark: matching shadow variant.** In
+//!    [`crate::inputs::multi_vector`]:
+//!    - Add the same variant to [`BenchIsa`].
+//!    - Add the matching arms to its `Display` impl and to `From<BenchIsa> for MaxSimIsa`.
+//!
+//! 3. **Run.** Set `"isa": "your-variant"` in the JSON job; the existing
+//!    `Kernel<T>` benchmark entries (registered once per element type)
+//!    handle the rest. No new `Benchmark` registration required.
+//!
+//! # Why two enums?
+//!
+//! [`MaxSimIsa`] (library) and [`BenchIsa`] are kept separate so the library
+//! doesn't pin its public API on a serde version or a particular JSON
+//! shape. The benchmark owns its kebab-case JSON layout; the library is
+//! serde-agnostic. Mirroring variant-for-variant is intentional — small
+//! price for keeping the library boundary clean.
+//!
+//! # Background
+//!
+//! The factory follows the BYOTE ("Bring your own type erasure") pattern
+//! described in [RFC #1068]. If you want your kernel packaged as something
+//! other than `Box<dyn MaxSimKernel<T>>` (e.g. composed with chamfer
+//! summing, or wrapped in a custom thin trait), implement your own
+//! [`Erase<T>`] and pass it to the factory in place of [`BoxErase`].
+//!
+//! [`build_max_sim`]: diskann_quantization::multi_vector::build_max_sim
+//! [`MaxSimIsa`]: diskann_quantization::multi_vector::MaxSimIsa
+//! [`MaxSimElement`]: diskann_quantization::multi_vector::MaxSimElement
+//! [`MaxSimElement::build`]: diskann_quantization::multi_vector::MaxSimElement::build
+//! [`MaxSimKernel<T>`]: diskann_quantization::multi_vector::MaxSimKernel
+//! [`Erase<T>`]: diskann_quantization::multi_vector::Erase
+//! [`BoxErase`]: diskann_quantization::multi_vector::BoxErase
+//! [`BenchIsa`]: crate::inputs::multi_vector::BenchIsa
+//! [RFC #1068]: https://github.com/microsoft/DiskANN/pull/1068
+
+use diskann_benchmark_runner::Registry;
+
+cfg_if::cfg_if! {
+    if #[cfg(feature = "multi-vector")] {
+        mod driver;
+        mod kernels;
+
+        pub(super) fn register_benchmarks(registry: &mut Registry) -> anyhow::Result<()> {
+            kernels::register(registry)
+        }
+    } else {
+        crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp);
+
+        pub(super) fn register_benchmarks(registry: &mut Registry) -> anyhow::Result<()> {
+            imp::register("multi-vector-op", registry)
+        }
+    }
+}
+
+#[cfg(all(test, feature = "multi-vector"))]
+mod tests {
+    use std::num::NonZeroUsize;
+
+    use diskann_benchmark_runner::{
+        benchmark::{PassFail, Regression},
+        utils::{
+            datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles,
+            MicroSeconds,
+        },
+    };
+
+    use super::driver::{CheckResult, Comparison, MultiVectorTolerance, RunResult};
+    use super::kernels::Kernel;
+    use crate::inputs::multi_vector::{BenchIsa, MultiVectorOp, Run};
+
+    fn tiny_run() -> Run {
+        Run {
+            num_query_vectors: NonZeroUsize::new(2).unwrap(),
+            num_doc_vectors: NonZeroUsize::new(2).unwrap(),
+            dim: NonZeroUsize::new(4).unwrap(),
+            loops_per_measurement: NonZeroUsize::new(1).unwrap(),
+            num_measurements: NonZeroUsize::new(1).unwrap(),
+        }
+    }
+
+    fn tiny_op() -> MultiVectorOp {
+        MultiVectorOp {
+            element_type: DataType::Float32,
+            isa: BenchIsa::Auto,
+            runs: vec![tiny_run()],
+        }
+    }
+
+    fn tiny_result(minimum: u64) -> RunResult {
+        let mut latencies = vec![MicroSeconds::new(minimum)];
+        let percentiles = compute_percentiles(&mut latencies).unwrap();
+        RunResult {
+            run: tiny_run(),
+            latencies,
+            percentiles,
+        }
+    }
+
+    fn tolerance(limit: f64) -> MultiVectorTolerance {
+        MultiVectorTolerance {
+            min_time_regression: NonNegativeFinite::new(limit).unwrap(),
+        }
+    }
+
+    #[test]
+    fn check_rejects_mismatched_runs() {
+        let kernel = Kernel::<f32>::new();
+
+        // Build a result whose `run` diverges from `tiny_run()` so the
+        // regression check's `b.run == a.run` invariant fires.
+        let mut latencies = vec![MicroSeconds::new(100)];
+        let percentiles = compute_percentiles(&mut latencies).unwrap();
+        let mismatched_result = RunResult {
+            run: Run {
+                num_query_vectors: NonZeroUsize::new(4).unwrap(),
+                ..tiny_run()
+            },
+            latencies,
+            percentiles,
+        };
+
+        let err = kernel
+            .check(
+                &tolerance(0.0),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![mismatched_result],
+            )
+            .unwrap_err();
+
+        assert_eq!(err.to_string(), "run 0 mismatched");
+    }
+
+    #[test]
+    fn check_allows_negative_relative_change() {
+        let kernel = Kernel::<f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.0),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(95)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Pass(_)));
+    }
+
+    #[test]
+    fn check_passes_on_tolerance_boundary() {
+        let kernel = Kernel::<f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(105)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Pass(_)));
+    }
+
+    #[test]
+    fn check_fails_above_tolerance_boundary() {
+        let kernel = Kernel::<f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(106)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Fail(_)));
+    }
+
+    #[test]
+    fn check_result_display_includes_failure_details() {
+        let check = CheckResult {
+            checks: vec![Comparison {
+                run: tiny_run(),
+                tolerance: tolerance(0.05),
+                before_min: 100.0,
+                after_min: 106.0,
+            }],
+        };
+
+        let rendered = check.to_string();
+        assert!(rendered.contains("Q"), "rendered = {rendered}");
+        assert!(rendered.contains("Dim"), "rendered = {rendered}");
+        assert!(rendered.contains("100.000"), "rendered = {rendered}");
+        assert!(rendered.contains("106.000"), "rendered = {rendered}");
+        assert!(rendered.contains("6.000 %"), "rendered = {rendered}");
+        assert!(rendered.contains("FAIL"), "rendered = {rendered}");
+    }
+
+    /// A minimum of 0 means the measurement was too fast to yield a reliable
+    /// signal, so accepting it *could* let a regression slip through. The
+    /// check must therefore report a failure rather than a pass.
+    #[test]
+    fn zero_values_rejected() {
+        let kernel = Kernel::<f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(0)],
+                &vec![tiny_result(0)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Fail(_)));
+    }
+
+    //////////////////////
+    // BoxedKernel      //
+    //////////////////////
+    //
+    // The library's `MaxSimKernel<T>` trait makes no zero-doc / size-assert
+    // guarantees — those contracts live on the `BoxedKernel<T>` wrapper in
+    // `driver.rs`. The tests below pin that wrapper's behaviour.
+
+    use super::driver::{BoxedKernel, Distance};
+    use diskann_quantization::multi_vector::{
+        build_max_sim, BoxErase, MatRef as LibMatRef, MaxSimIsa, Standard as LibStandard,
+    };
+
+    fn boxed_kernel_f32_two_rows() -> BoxedKernel<f32> {
+        let data = [1.0f32, 0.0, 0.0, 1.0];
+        let query = LibMatRef::new(LibStandard::new(2, 2).unwrap(), data.as_slice()).unwrap();
+        BoxedKernel(build_max_sim::<f32, _>(MaxSimIsa::Auto, query, BoxErase).unwrap())
+    }
+
+    #[test]
+    fn boxed_kernel_max_sim_with_zero_docs_leaves_scores_untouched() {
+        let kernel = boxed_kernel_f32_two_rows();
+        let empty: [f32; 0] = [];
+        let doc = LibMatRef::new(LibStandard::new(0, 2).unwrap(), empty.as_slice()).unwrap();
+        let mut scores = vec![0.0f32; 2];
+        kernel.max_sim(doc, &mut scores);
+        for &s in &scores {
+            assert_eq!(s, 0.0, "zero-doc max_sim should leave scores untouched");
+        }
+    }
+
+    #[test]
+    #[should_panic(expected = "scores buffer not right size")]
+    fn boxed_kernel_max_sim_panics_on_size_mismatch() {
+        let kernel = boxed_kernel_f32_two_rows();
+        let doc_data = [1.0f32, 1.0];
+        let doc = LibMatRef::new(LibStandard::new(1, 2).unwrap(), doc_data.as_slice()).unwrap();
+        let mut scores = vec![0.0f32; 3]; // Wrong size: 3 vs kernel's nrows() = 2.
+        kernel.max_sim(doc, &mut scores);
+    }
+}
diff --git a/diskann-benchmark/src/inputs/mod.rs b/diskann-benchmark/src/inputs/mod.rs
index 492f0b9c1..0d429c0c5 100644
--- a/diskann-benchmark/src/inputs/mod.rs
+++ b/diskann-benchmark/src/inputs/mod.rs
@@ -7,6 +7,7 @@ pub(crate) mod disk;
 pub(crate) mod exhaustive;
 pub(crate) mod filters;
 pub(crate) mod graph_index;
+pub(crate) mod multi_vector;
 pub(crate) mod save_and_load;
 
 /// Construct an example input of type `Self`.
diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs
new file mode 100644
index 000000000..cbb1c255b
--- /dev/null
+++ b/diskann-benchmark/src/inputs/multi_vector.rs
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+use std::num::NonZeroUsize;
+
+use diskann_benchmark_runner::{utils::datatype::DataType, Checker, Input};
+use diskann_quantization::multi_vector::MaxSimIsa;
+use serde::{Deserialize, Serialize};
+
+////////////////
+// Enum types //
+////////////////
+
+/// JSON-facing shadow of [`MaxSimIsa`] from `diskann-quantization`. The
+/// library's enum is deliberately not `Serialize`/`Deserialize` so it isn't
+/// pinned to a particular JSON shape; this enum owns the kebab-case
+/// serialization and converts to the library type at dispatch time.
+///
+/// **Must be kept in sync with `MaxSimIsa` manually.** When the library adds a
+/// variant, mirror it here and add matching arms to the `Display` impl and to
+/// `From<BenchIsa> for MaxSimIsa`.
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "kebab-case")]
+#[non_exhaustive]
+pub(crate) enum BenchIsa {
+    #[serde(rename = "x86-64-v4")]
+    #[allow(non_camel_case_types)]
+    X86_64_V4,
+    #[serde(rename = "x86-64-v3")]
+    #[allow(non_camel_case_types)]
+    X86_64_V3,
+    Neon,
+    Scalar,
+    Reference,
+    Auto,
+}
+
+impl std::fmt::Display for BenchIsa {
+    /// Writes the kebab-case ISA name, the same spelling the serde attributes use.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            Self::X86_64_V4 => "x86-64-v4",
+            Self::X86_64_V3 => "x86-64-v3",
+            Self::Neon => "neon",
+            Self::Scalar => "scalar",
+            Self::Reference => "reference",
+            Self::Auto => "auto",
+        })
+    }
+}
+
+impl From<BenchIsa> for MaxSimIsa {
+    fn from(isa: BenchIsa) -> Self {
+        match isa {
+            BenchIsa::X86_64_V4 => Self::X86_64_V4,
+            BenchIsa::X86_64_V3 => Self::X86_64_V3,
+            BenchIsa::Neon => Self::Neon,
+            BenchIsa::Scalar => Self::Scalar,
+            BenchIsa::Reference => Self::Reference,
+            BenchIsa::Auto => Self::Auto,
+        }
+    }
+}
+
+/// One benchmark configuration: a single shape measurement.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub(crate) struct Run {
+    pub(crate) num_query_vectors: NonZeroUsize,
+    pub(crate) num_doc_vectors: NonZeroUsize,
+    pub(crate) dim: NonZeroUsize,
+    pub(crate) loops_per_measurement: NonZeroUsize,
+    pub(crate) num_measurements: NonZeroUsize,
+}
+
+///////////////////////
+// Multi-Vector Op   //
+///////////////////////
+
+/// A complete multi-vector benchmark job.
+#[derive(Debug, Serialize, Deserialize)]
+pub(crate) struct MultiVectorOp {
+    pub(crate) element_type: DataType,
+    pub(crate) isa: BenchIsa,
+    pub(crate) runs: Vec<Run>,
+}
+
+impl MultiVectorOp {
+    pub(crate) const fn tag() -> &'static str {
+        "multi-vector-op"
+    }
+}
+
+impl Input for MultiVectorOp {
+    type Raw = Self;
+
+    fn tag() -> &'static str {
+        Self::tag()
+    }
+
+    fn from_raw(raw: Self::Raw, _checker: &mut Checker) -> anyhow::Result<Self> {
+        Ok(raw)
+    }
+
+    fn serialize(&self) -> anyhow::Result<serde_json::Value> {
+        Ok(serde_json::to_value(self)?)
+    }
+
+    fn example() -> Self {
+        const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap();
+        const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap();
+        const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap();
+        const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap();
+
+        let runs = vec![
+            Run {
+                num_query_vectors: NonZeroUsize::new(32).unwrap(),
+                num_doc_vectors: NUM_DOC_VECTORS,
+                dim: DIM,
+                loops_per_measurement: LOOPS_PER_MEASUREMENT,
+                num_measurements: NUM_MEASUREMENTS,
+            },
+            Run {
+                num_query_vectors: NonZeroUsize::new(64).unwrap(),
+                num_doc_vectors: NUM_DOC_VECTORS,
+                dim: DIM,
+                loops_per_measurement: LOOPS_PER_MEASUREMENT,
+                num_measurements: NUM_MEASUREMENTS,
+            },
+        ];
+
+        Self {
+            element_type: DataType::Float32,
+            isa: BenchIsa::Auto,
+            runs,
+        }
+    }
+}
+
+macro_rules! write_field {
+    ($f:ident, $field:tt, $($expr:tt)*) => {
+        writeln!($f, "{:>18}: {}", $field, $($expr)*)
+    }
+}
+
+impl std::fmt::Display for MultiVectorOp {
+    /// Prints a title line, then the tag, element type, ISA and number of runs.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Multi-Vector Operation\n")?;
+        write_field!(f, "tag", Self::tag())?;
+        write_field!(f, "element type", self.element_type)?;
+        write_field!(f, "isa", self.isa)?;
+        write_field!(f, "number of runs", self.runs.len())
+    }
+}
diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs
index cc70120cd..c87a08e17 100644
--- a/diskann-benchmark/src/main.rs
+++ b/diskann-benchmark/src/main.rs
@@ -772,6 +772,92 @@ mod tests {
         assert!(!output_path.exists());
     }
 
+    ///////////////////
+    // Multi-Vector  //
+    ///////////////////
+
+    #[test]
+    fn multi_vector_integration() {
+        let path = example_directory().join("multi-vector.json");
+        let tempdir = tempfile::tempdir().unwrap();
+        let output_path = tempdir.path().join("output.json");
+        assert!(!output_path.exists());
+
+        let modified_input_path = tempdir.path().join("input.json");
+
+        let mut raw = value_from_file(&path);
+        prefix_search_directories(&mut raw, &root_directory());
+        save_to_file(&modified_input_path, &raw);
+
+        run_multi_vector_integration(&modified_input_path, &output_path)
+    }
+
+    #[cfg(feature = "multi-vector")]
+    fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) {
+        let command = Commands::Run {
+            input_file: input_path.to_owned(),
+            output_file: output_path.to_owned(),
+            dry_run: false,
+            allow_debug: true,
+        };
+
+        let cli = Cli::from_commands(command, true);
+        let mut output = Memory::new();
+
+        cli.run(&mut output).unwrap();
+        println!(
+            "output = {}",
+            String::from_utf8(output.into_inner()).unwrap()
+        );
+
+        // Check that the results file is generated.
+        assert!(output_path.exists());
+    }
+
+    #[cfg(not(feature = "multi-vector"))]
+    fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) {
+        let command = Commands::Run {
+            input_file: input_path.to_owned(),
+            output_file: output_path.to_owned(),
+            dry_run: false,
+            allow_debug: true,
+        };
+        let cli = Cli::from_commands(command, true);
+        let mut output = Memory::new();
+
+        let err = cli.run(&mut output).unwrap_err();
+        println!("err = {:?}", err);
+        // Print the captured output before asserting so it is visible if the assertion fails.
+        let output = String::from_utf8(output.into_inner()).unwrap();
+        println!("output = {}", output);
+        assert!(output.contains("\"multi-vector\" feature"));
+
+        // The output file should not have been created because the run failed.
+        assert!(!output_path.exists());
+    }
+
+    #[test]
+    #[cfg(feature = "multi-vector")]
+    fn multi_vector_check_verify() {
+        let input_path = example_directory().join("multi-vector.json");
+        let tolerance_path = project_directory()
+            .join("perf_test_inputs")
+            .join("multi-vector-tolerance.json");
+
+        let command = Commands::Check(diskann_benchmark_runner::app::Check::Verify {
+            tolerances: tolerance_path,
+            input_file: input_path,
+        });
+
+        let cli = Cli::from_commands(command, true);
+        let mut output = Memory::new();
+        cli.run(&mut output).unwrap();
+        println!(
+            "output = {}",
+            String::from_utf8(output.into_inner()).unwrap()
+        );
+    }
+
     #[test]
     fn quiet_suppresses_check_target_warning() {
         let cli = Cli::from_commands(Commands::Skeleton, true);
diff --git a/diskann-quantization/src/multi_vector/distance/factory.rs b/diskann-quantization/src/multi_vector/distance/factory.rs
new file mode 100644
index 000000000..0bfe82fc1
--- /dev/null
+++ b/diskann-quantization/src/multi_vector/distance/factory.rs
@@ -0,0 +1,545 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! Factory + concrete `MaxSimKernel<T>` implementations for the multi-vector
+//! distance API. See [`build_max_sim`] for the BYOTE entry point and
+//! [`MaxSimElement`] for the sealed trait that gates accepted element types.
+
+use diskann_utils::Reborrow;
+use diskann_vector::distance::InnerProduct;
+use diskann_vector::{DistanceFunctionMut, PureDistanceFunction};
+use diskann_wide::Architecture;
+use diskann_wide::arch::Scalar;
+#[cfg(target_arch = "aarch64")]
+use diskann_wide::arch::aarch64::Neon;
+#[cfg(target_arch = "x86_64")]
+use diskann_wide::arch::x86_64::{V3, V4};
+
+use super::isa::{MaxSimIsa, NotSupported};
+use super::kernel::{Erase, MaxSimKernel};
+use super::kernels::f16::F16Entry;
+use super::kernels::f32::F32Kernel;
+use super::max_sim::MaxSim;
+use crate::multi_vector::distance::QueryMatRef;
+use crate::multi_vector::{BlockTransposed, BlockTransposedRef, Mat, MatRef, Standard};
+
+// ─────────────────────────────────────────────────────────────────────────
+//  Prepared<A, Q> — concrete kernel for the arch-dispatched paths.
+// ─────────────────────────────────────────────────────────────────────────
+
+/// Concrete kernel: owns an arch token and a block-transposed prepared query.
+/// There is one `MaxSimKernel<T>` impl per element type (f32, f16); each is
+/// generic over the arch token `A` and the block size, and runs the matching
+/// `Target3` kernel from the `kernels` module through `arch.run3`.
+#[derive(Debug)]
+struct Prepared<A, Q> {
+    arch: A,
+    prepared: Q,
+}
+
+impl<A, const GROUP: usize> MaxSimKernel<f32> for Prepared<A, BlockTransposed<f32, GROUP>>
+where
+    A: Architecture,
+    F32Kernel<GROUP>: for<'a> diskann_wide::arch::Target3<
+            A,
+            (),
+            BlockTransposedRef<'a, f32, GROUP>,
+            MatRef<'a, Standard<f32>>,
+            &'a mut [f32],
+        >,
+{
+    fn nrows(&self) -> usize {
+        self.prepared.nrows()
+    }
+
+    /// Writes the negated first `nrows()` kernel outputs (from padded scratch) into `scores`.
+    fn compute_max_sim(&self, doc: MatRef<'_, Standard<f32>>, scores: &mut [f32]) {
+        let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()];
+        self.arch.run3(
+            F32Kernel::<GROUP>,
+            self.prepared.reborrow(),
+            doc,
+            &mut scratch,
+        );
+        let valid = &scratch[..self.prepared.nrows()];
+        scores.iter_mut().zip(valid).for_each(|(dst, &src)| *dst = -src);
+    }
+}
+
+impl<A, const GROUP: usize> MaxSimKernel<half::f16>
+    for Prepared<A, BlockTransposed<half::f16, GROUP>>
+where
+    A: Architecture,
+    F16Entry<GROUP>: for<'a> diskann_wide::arch::Target3<
+            A,
+            (),
+            BlockTransposedRef<'a, half::f16, GROUP>,
+            MatRef<'a, Standard<half::f16>>,
+            &'a mut [f32],
+        >,
+{
+    fn nrows(&self) -> usize {
+        self.prepared.nrows()
+    }
+
+    /// Writes the negated first `nrows()` kernel outputs (from padded scratch) into `scores`.
+    fn compute_max_sim(&self, doc: MatRef<'_, Standard<half::f16>>, scores: &mut [f32]) {
+        let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()];
+        self.arch.run3(
+            F16Entry::<GROUP>,
+            self.prepared.reborrow(),
+            doc,
+            &mut scratch,
+        );
+        let valid = &scratch[..self.prepared.nrows()];
+        scores.iter_mut().zip(valid).for_each(|(dst, &src)| *dst = -src);
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+//  ReferenceKernel<T> — non-SIMD fallback that wraps MaxSim::evaluate.
+// ─────────────────────────────────────────────────────────────────────────
+
+/// `MaxSimIsa::Reference` path. Owns the query as a `Mat<Standard<T>>` and
+/// delegates to [`MaxSim`] per `compute_max_sim` call.
+struct ReferenceKernel<T: Copy> {
+    query: Mat<Standard<T>>,
+}
+
+impl<T: Copy + std::fmt::Debug> std::fmt::Debug for ReferenceKernel<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ReferenceKernel")
+            .field("nrows", &self.query.num_vectors())
+            .finish()
+    }
+}
+
+impl<T: Copy> ReferenceKernel<T> {
+    /// Deep-copies `query` into an owned `Mat`; the i-th `from_fn` call yields slice element i.
+    fn new(query: MatRef<'_, Standard<T>>) -> Self {
+        let src = query.as_slice();
+        let mut idx = 0usize;
+        let owned = Mat::<Standard<T>>::from_fn(*query.repr(), || {
+            let v = src[idx];
+            idx += 1;
+            v
+        });
+        Self { query: owned }
+    }
+}
+
+impl<T> MaxSimKernel<T> for ReferenceKernel<T>
+where
+    T: Copy + Send + Sync + std::fmt::Debug + 'static,
+    InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
+{
+    fn nrows(&self) -> usize {
+        self.query.num_vectors()
+    }
+
+    fn compute_max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
+        if scores.is_empty() {
+            return;
+        }
+        let query: QueryMatRef<'_, Standard<T>> = self.query.as_view().into();
+        let Ok(mut max_sim) = MaxSim::new(scores) else {
+            return;
+        };
+        let _ = max_sim.evaluate(query, doc);
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+//  BuildAndErase<E> — Target1 impls behind `run1` and `dispatch1_no_features`.
+// ─────────────────────────────────────────────────────────────────────────
+
+/// Internal `Target1` carrier used by every arch-dispatched arm of
+/// [`MaxSimElement::build`]: explicit ISAs call `run1` on their arch token,
+/// while `MaxSimIsa::Auto` goes through `dispatch1_no_features`. Either way the
+/// matching `Target1::run` below prepares the query and erases the kernel.
+struct BuildAndErase<E>(E);
+
+// ───── f32 Target1 impls ─────
+
+impl<E: Erase<f32>> diskann_wide::arch::Target1<Scalar, E::Output, MatRef<'_, Standard<f32>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: Scalar, query: MatRef<'_, Standard<f32>>) -> E::Output {
+        let prepared = BlockTransposed::<f32, 8>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared })
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl<E: Erase<f32>> diskann_wide::arch::Target1<V3, E::Output, MatRef<'_, Standard<f32>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: V3, query: MatRef<'_, Standard<f32>>) -> E::Output {
+        let prepared = BlockTransposed::<f32, 16>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared })
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl<E: Erase<f32>> diskann_wide::arch::Target1<V4, E::Output, MatRef<'_, Standard<f32>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: V4, query: MatRef<'_, Standard<f32>>) -> E::Output {
+        // V4 dispatches to V3 (no V4-specific kernel).
+        let arch = arch.retarget();
+        let prepared = BlockTransposed::<f32, 16>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared })
+    }
+}
+
+#[cfg(target_arch = "aarch64")]
+impl<E: Erase<f32>> diskann_wide::arch::Target1<Neon, E::Output, MatRef<'_, Standard<f32>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: Neon, query: MatRef<'_, Standard<f32>>) -> E::Output {
+        // Neon dispatches to Scalar (no Neon-specific kernel).
+        let arch = arch.retarget();
+        let prepared = BlockTransposed::<f32, 8>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared })
+    }
+}
+
+// ───── f16 Target1 impls ─────
+
+impl<E: Erase<half::f16>>
+    diskann_wide::arch::Target1<Scalar, E::Output, MatRef<'_, Standard<half::f16>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: Scalar, query: MatRef<'_, Standard<half::f16>>) -> E::Output {
+        let prepared = BlockTransposed::<half::f16, 8>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared })
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl<E: Erase<half::f16>>
+    diskann_wide::arch::Target1<V3, E::Output, MatRef<'_, Standard<half::f16>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: V3, query: MatRef<'_, Standard<half::f16>>) -> E::Output {
+        let prepared = BlockTransposed::<half::f16, 16>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared })
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl<E: Erase<half::f16>>
+    diskann_wide::arch::Target1<V4, E::Output, MatRef<'_, Standard<half::f16>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: V4, query: MatRef<'_, Standard<half::f16>>) -> E::Output {
+        // V4 dispatches to V3 (no V4-specific kernel).
+        let arch = arch.retarget();
+        let prepared = BlockTransposed::<half::f16, 16>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared })
+    }
+}
+
+#[cfg(target_arch = "aarch64")]
+impl<E: Erase<half::f16>>
+    diskann_wide::arch::Target1<Neon, E::Output, MatRef<'_, Standard<half::f16>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: Neon, query: MatRef<'_, Standard<half::f16>>) -> E::Output {
+        // Neon dispatches to Scalar (no Neon-specific kernel).
+        let arch = arch.retarget();
+        let prepared = BlockTransposed::<half::f16, 8>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared })
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+//  MaxSimElement — sealed trait gating accepted element types.
+// ─────────────────────────────────────────────────────────────────────────
+
+mod sealed {
+    pub trait Sealed {}
+}
+
+/// Scalar element types accepted by the multi-vector MaxSim factory.
+///
+/// Sealed: external crates cannot add impls. The library ships impls for
+/// `f32` and `half::f16`. Quantized representations (PQ, SQ, packed sub-byte)
+/// do not fit this trait — they carry per-vector codebook/scale state and
+/// will get dedicated factory functions when they are added.
+pub trait MaxSimElement: sealed::Sealed + Sized + Copy + Send + Sync + 'static {
+    /// Build the concrete kernel for this element type and hand it to
+    /// `erase.erase(...)`. Returns [`NotSupported`] when the host CPU lacks the
+    /// requested ISA (e.g. AVX-512) or it targets another architecture (e.g. Neon on x86_64).
+    fn build<E: Erase<Self>>(
+        isa: MaxSimIsa,
+        query: MatRef<'_, Standard<Self>>,
+        erase: E,
+    ) -> Result<E::Output, NotSupported>;
+}
+
+impl sealed::Sealed for f32 {}
+impl sealed::Sealed for half::f16 {}
+
+impl MaxSimElement for f32 {
+    fn build<E: Erase<f32>>(
+        isa: MaxSimIsa,
+        query: MatRef<'_, Standard<f32>>,
+        erase: E,
+    ) -> Result<E::Output, NotSupported> {
+        match isa {
+            MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features(
+                BuildAndErase(erase),
+                query,
+            )),
+            MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)),
+            #[cfg(target_arch = "x86_64")]
+            MaxSimIsa::X86_64_V3 => {
+                let arch = V3::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "AVX2/FMA unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(target_arch = "x86_64")]
+            MaxSimIsa::X86_64_V4 => {
+                let arch = V4::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "AVX-512 unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(not(target_arch = "x86_64"))]
+            MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported {
+                isa,
+                reason: "x86_64 target only",
+            }),
+            #[cfg(target_arch = "aarch64")]
+            MaxSimIsa::Neon => {
+                let arch = Neon::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "Neon unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(not(target_arch = "aarch64"))]
+            MaxSimIsa::Neon => Err(NotSupported {
+                isa,
+                reason: "aarch64 target only",
+            }),
+            MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::<f32>::new(query))),
+        }
+    }
+}
+
+impl MaxSimElement for half::f16 { // mirrors the `f32` impl above; only the element type differs
+    fn build<E: Erase<half::f16>>(
+        isa: MaxSimIsa,
+        query: MatRef<'_, Standard<half::f16>>,
+        erase: E,
+    ) -> Result<E::Output, NotSupported> {
+        match isa { // each arm either builds the kernel or reports why `isa` is unusable
+            MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features(
+                BuildAndErase(erase),
+                query,
+            )),
+            MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)),
+            #[cfg(target_arch = "x86_64")]
+            MaxSimIsa::X86_64_V3 => {
+                let arch = V3::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "AVX2/FMA unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(target_arch = "x86_64")]
+            MaxSimIsa::X86_64_V4 => {
+                let arch = V4::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "AVX-512 unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(not(target_arch = "x86_64"))] // keeps the match exhaustive off x86_64
+            MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported {
+                isa,
+                reason: "x86_64 target only",
+            }),
+            #[cfg(target_arch = "aarch64")]
+            MaxSimIsa::Neon => {
+                let arch = Neon::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "Neon unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(not(target_arch = "aarch64"))] // keeps the match exhaustive off aarch64
+            MaxSimIsa::Neon => Err(NotSupported {
+                isa,
+                reason: "aarch64 target only",
+            }),
+            MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::<half::f16>::new(query))),
+        }
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+//  Factory entry point.
+// ─────────────────────────────────────────────────────────────────────────
+
+/// Build a multi-vector MaxSim kernel for any [`MaxSimElement`] type.
+///
+/// Thin wrapper over [`MaxSimElement::build`] so generic callers can write
+/// `build_max_sim::<T, _>(isa, query, erase)` without naming the trait.
+/// Returns whatever `erase` produces for the selected kernel, or [`NotSupported`]
+/// if `isa` cannot run here (e.g. no AVX-512 on this CPU; Neon on an x86_64 build).
+pub fn build_max_sim<T: MaxSimElement, E: Erase<T>>(
+    isa: MaxSimIsa,
+    query: MatRef<'_, Standard<T>>,
+    erase: E,
+) -> Result<E::Output, NotSupported> {
+    T::build(isa, query, erase)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::multi_vector::{BoxErase, Chamfer, MaxSim, QueryMatRef};
+
+    /// Local helper trait — picks a sane test value of `T` from an `f32`
+    /// so both `f32` and `half::f16` parameterizations share the same data
+    /// generator.
+    trait FromF32 {
+        fn from_f32(v: f32) -> Self;
+    }
+
+    impl FromF32 for f32 {
+        fn from_f32(v: f32) -> Self {
+            v
+        }
+    }
+
+    impl FromF32 for half::f16 {
+        fn from_f32(v: f32) -> Self {
+            diskann_wide::cast_f32_to_f16(v)
+        }
+    }
+
+    fn make_mat<T: Copy>(data: &[T], nrows: usize, ncols: usize) -> MatRef<'_, Standard<T>> {
+        MatRef::new(Standard::new(nrows, ncols).unwrap(), data).unwrap()
+    }
+
+    /// Yields `len` values cycling through `0..ceil`, the first being `shift % ceil`.
+    fn make_test_data<T: FromF32>(len: usize, ceil: usize, shift: usize) -> Vec<T> {
+        let value_at = |idx: usize| ((idx + shift) % ceil) as f32;
+        (0..len).map(value_at).map(T::from_f32).collect()
+    }
+
+    /// Shapes for the `chamfer_matches_fallback` / `max_sim_matches_fallback`
+    /// agreement checks: `(num_queries, num_docs, dim)`.
+    ///
+    /// Targets the factory wiring (query setup, score writeback) above the
+    /// kernel layer; exhaustive panel/remainder coverage is pinned in
+    /// `kernels::tiled_reduce::tests`.
+    const TEST_CASES: &[(usize, usize, usize)] = &[
+        (1, 1, 4),   // Degenerate
+        (5, 3, 5),   // Prime k; nq > 1 and nd > 1 exercise per-row writeback
+        (17, 4, 64), // A-panel remainder crossing both Scalar and V3 panel widths
+        (16, 6, 32), // B-remainder ≠ 1 (V3 b_remainder = 2)
+    ];
+
+    fn check_chamfer_matches<T>(tol: f32, label: &str)
+    where
+        T: MaxSimElement + FromF32,
+        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
+    {
+        for &(nq, nd, dim) in TEST_CASES {
+            let query_data = make_test_data::<T>(nq * dim, dim, dim / 2);
+            let doc_data = make_test_data::<T>(nd * dim, dim, dim);
+
+            let query = make_mat(&query_data, nq, dim);
+            let doc = make_mat(&doc_data, nd, dim);
+
+            let expected = Chamfer::evaluate(QueryMatRef::from(query), doc);
+
+            let kernel = build_max_sim::<T, _>(MaxSimIsa::Auto, query, BoxErase).unwrap();
+            let mut scores = vec![0.0f32; nq];
+            kernel.compute_max_sim(doc, &mut scores);
+            let actual: f32 = scores.iter().sum();
+
+            assert!(
+                (actual - expected).abs() < tol,
+                "{label}Chamfer mismatch for ({nq},{nd},{dim}): actual={actual}, expected={expected}",
+            );
+        }
+    }
+
+    /// Asserts the `Auto` kernel's per-row scores match `MaxSim` within `tol` for every shape.
+    fn check_max_sim_matches<T>(tol: f32, label: &str)
+    where
+        T: MaxSimElement + FromF32,
+        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
+    {
+        for &(nq, nd, dim) in TEST_CASES {
+            let query_data = make_test_data::<T>(nq * dim, dim, dim / 2);
+            let doc_data = make_test_data::<T>(nd * dim, dim, dim);
+
+            let query = make_mat(&query_data, nq, dim);
+            let doc = make_mat(&doc_data, nd, dim);
+
+            let mut expected_scores = vec![0.0f32; nq];
+            let _ = MaxSim::new(&mut expected_scores)
+                .unwrap()
+                .evaluate(QueryMatRef::from(query), doc);
+
+            let kernel = build_max_sim::<T, _>(MaxSimIsa::Auto, query, BoxErase).unwrap();
+            let mut actual_scores = vec![0.0f32; nq];
+            kernel.compute_max_sim(doc, &mut actual_scores);
+
+            let pairs = actual_scores.iter().zip(&expected_scores);
+            for (i, (actual, expected)) in pairs.enumerate() {
+                assert!(
+                    (actual - expected).abs() < tol,
+                    "{label}MaxSim[{i}] mismatch for ({nq},{nd},{dim}): actual={actual}, expected={expected}",
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn dimensions_f32() {
+        let data = vec![1.0f32; 5 * 8];
+        let query = make_mat(&data, 5, 8);
+        let kernel = build_max_sim::<f32, _>(MaxSimIsa::Auto, query, BoxErase).unwrap();
+        assert_eq!(kernel.nrows(), 5);
+    }
+
+    #[test]
+    fn dimensions_f16() {
+        let data = vec![diskann_wide::cast_f32_to_f16(1.0); 5 * 8];
+        let query = make_mat(data.as_slice(), 5, 8);
+        let kernel = build_max_sim::<half::f16, _>(MaxSimIsa::Auto, query, BoxErase).unwrap();
+        assert_eq!(kernel.nrows(), 5);
+    }
+
+    macro_rules! test_matches_fallback {
+        ($mod_name:ident, $ty:ty, $tol:expr, $label:literal) => {
+            mod $mod_name {
+                use super::*;
+
+                #[test]
+                fn chamfer_matches_fallback() {
+                    check_chamfer_matches::<$ty>($tol, $label);
+                }
+
+                #[test]
+                fn max_sim_matches_fallback() {
+                    check_max_sim_matches::<$ty>($tol, $label);
+                }
+            }
+        };
+    }
+
+    test_matches_fallback!(f32, f32, 1e-10, "f32 ");
+    test_matches_fallback!(f16, half::f16, 1e-10, "f16 ");
+}
diff --git a/diskann-quantization/src/multi_vector/distance/isa.rs b/diskann-quantization/src/multi_vector/distance/isa.rs
new file mode 100644
index 000000000..d4495dd55
--- /dev/null
+++ b/diskann-quantization/src/multi_vector/distance/isa.rs
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! Instruction Set Architecture (ISA) selector for the multi-vector MaxSim
+//! factory.
+
+/// Instruction Set Architecture (ISA) selector for which multi-vector MaxSim
+/// kernel to build.
+///
+/// `#[non_exhaustive]` so adding a variant (e.g. for a new in-tree kernel) is
+/// not a breaking change. Deliberately **not** `Serialize`/`Deserialize` —
+/// callers wanting JSON support maintain their own shadow enum and convert
+/// via `From` / `TryFrom`, so the library is not pinned to a particular
+/// serialization format.
+#[non_exhaustive]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[allow(non_camel_case_types)] // needed for the `X86_64_V3` / `X86_64_V4` variant names
+pub enum MaxSimIsa {
+    /// Pick the highest ISA the host CPU supports.
+    Auto,
+    /// Pure-scalar (emulated SIMD) kernel — always available.
+    Scalar,
+    /// x86_64 AVX2 + FMA.
+    X86_64_V3,
+    /// x86_64 AVX-512.
+    X86_64_V4,
+    /// AArch64 Neon.
+    Neon,
+    /// Non-SIMD reference fallback. Slow; serves as a correctness baseline.
+    Reference,
+}
+
+impl std::fmt::Display for MaxSimIsa {
+    /// Writes the lowercase ISA name via `pad`, so width/fill flags such as `{:<10}` apply.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.pad(match self {
+            Self::Auto => "auto",
+            Self::Scalar => "scalar",
+            Self::X86_64_V3 => "x86-64-v3",
+            Self::X86_64_V4 => "x86-64-v4",
+            Self::Neon => "neon",
+            Self::Reference => "reference",
+        })
+    }
+}
+
+/// Returned by [`build_max_sim`](super::build_max_sim) when the requested
+/// ISA cannot be produced on the current host (e.g. x86_64 V4 requested on
+/// a non-AVX512 CPU, or Neon requested on x86_64).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct NotSupported {
+    pub isa: MaxSimIsa, // the ISA that was requested
+    pub reason: &'static str, // why it is unavailable; printed by `Display`
+}
+
+impl std::fmt::Display for NotSupported {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} not supported: {}", self.isa, self.reason)
+    }
+}
+
+impl std::error::Error for NotSupported {}
diff --git a/diskann-quantization/src/multi_vector/distance/kernel.rs b/diskann-quantization/src/multi_vector/distance/kernel.rs
new file mode 100644
index 000000000..a2fd530d9
--- /dev/null
+++ b/diskann-quantization/src/multi_vector/distance/kernel.rs
@@ -0,0 +1,53 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! Object-safe kernel boundary trait plus the "bring your own type erasure" visitor trait.
+
+use crate::multi_vector::{MatRef, Standard};
+
+/// Object-safe interface for computing per-query MaxSim scores.
+///
+/// # Contract
+///
+/// - `scores.len() == self.nrows()` (caller's precondition).
+/// - The implementation must populate **all** `nrows()` entries of `scores`.
+///   Callers that derive quantities from the full score vector (e.g. sums)
+///   would silently corrupt their result if any trailing entry were left
+///   unwritten.
+pub trait MaxSimKernel<T: Copy>: Send + Sync + std::fmt::Debug {
+    /// Number of query rows scored by this kernel; also the required length of `scores`.
+    fn nrows(&self) -> usize;
+
+    /// Compute per-query MaxSim scores against `doc`, writing every entry of `scores`.
+    fn compute_max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]);
+}
+
+/// "Bring your own type erasure" visitor. The factory hands an implementation
+/// to `erase`, which decides how to package / type-erase it. Lets different
+/// callers produce different output shapes (e.g. `Box<dyn MaxSimKernel<T>>`,
+/// a chamfer-only closure, a batched evaluator, ...) from the same factory.
+///
+/// See [`BoxErase`] for the default impl used by most callers.
+pub trait Erase<T: Copy> {
+    /// What the visitor produces.
+    type Output;
+    /// Visit the concrete kernel. `K` is generic so the body sees its concrete
+    /// type and the compiler can inline it into the wrapper.
+    fn erase<K: MaxSimKernel<T> + 'static>(self, kernel: K) -> Self::Output;
+}
+
+/// Default [`Erase`] impl: produces `Box<dyn MaxSimKernel<T>>`.
+///
+/// Use this when the caller just wants a heap-allocated kernel object behind
+/// a vtable. For custom packaging (chamfer-only, batched, composed), write
+/// your own `Erase` impl and pass it to the factory in place of `BoxErase`.
+#[derive(Debug, Clone, Copy)]
+pub struct BoxErase;
+
+impl<T: Copy + 'static> Erase<T> for BoxErase {
+    type Output = Box<dyn MaxSimKernel<T>>;
+
+    fn erase<K: MaxSimKernel<T> + 'static>(self, kernel: K) -> Self::Output {
+        Box::new(kernel)
+    }
+}
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
index bd9121a24..55108698d 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
@@ -3,9 +3,8 @@
 
 //! Block-transposed SIMD kernels for multi-vector distance computation.
 //!
-//! This module provides a SIMD-accelerated implementation that uses block-transposed
-//! memory layout for **query** vectors (instead of documents), with documents remaining
-//! in row-major format.
+//! SIMD-accelerated implementation that uses block-transposed memory layout
+//! for **query** vectors, with documents remaining in row-major format.
 //!
 //! # Memory Layout
 //!
diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs
index 853f60753..d4bc2725d 100644
--- a/diskann-quantization/src/multi_vector/distance/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/mod.rs
@@ -5,15 +5,17 @@
 //!
 //! Provides asymmetric distance primitives for multi-vector search:
 //!
-//! - [`MaxSim`]: Per-query-vector maximum similarities.
-//! - [`Chamfer`]: Sum of MaxSim scores (asymmetric Chamfer distance).
-//! - [`QueryComputer`]: Architecture-dispatched query computer backed by
-//!   SIMD-accelerated block-transposed kernels.
+//! - [`MaxSim`]: per-query-vector maximum similarities.
+//! - [`Chamfer`]: sum of MaxSim scores (asymmetric Chamfer distance).
+//! - [`MaxSimKernel`]: object-safe interface implemented by every concrete
+//!   kernel constructed through [`build_max_sim`].
+//! - [`Erase`]: "bring your own type erasure" visitor — the caller chooses the packaging.
+//! - [`MaxSimElement`]: sealed trait gating which element types the factory
+//!   accepts.
 //!
 //! The fallback path uses a double-loop kernel over
-//! [`InnerProduct`](diskann_vector::distance::InnerProduct). The optimised
-//! path (via [`QueryComputer`]) uses block-transposed layout with
-//! cache-tiled SIMD micro-kernels.
+//! [`InnerProduct`](diskann_vector::distance::InnerProduct). [`build_max_sim`] returns
+//! cache-tiled SIMD kernels, or the non-SIMD reference kernel, as selected by [`MaxSimIsa`].
 //!
 //! # Example
 //!
@@ -49,11 +51,15 @@
 //! // scores[1] =  0.0 (query[1] has no good match: max IP was 0)
 //! ```
 
+mod factory;
 mod fallback;
+mod isa;
+mod kernel;
 mod kernels;
 mod max_sim;
-mod query_computer;
 
+pub use factory::{MaxSimElement, build_max_sim};
 pub use fallback::QueryMatRef;
+pub use isa::{MaxSimIsa, NotSupported};
+pub use kernel::{BoxErase, Erase, MaxSimKernel};
 pub use max_sim::{Chamfer, MaxSim, MaxSimError};
-pub use query_computer::QueryComputer;
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs
deleted file mode 100644
index 9bb348a6a..000000000
--- a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-use diskann_wide::Architecture;
-use diskann_wide::arch::Scalar;
-#[cfg(target_arch = "aarch64")]
-use diskann_wide::arch::aarch64::Neon;
-#[cfg(target_arch = "x86_64")]
-use diskann_wide::arch::x86_64::{V3, V4};
-
-use super::{DynQueryComputer, Prepared, QueryComputer, build_prepared};
-use crate::multi_vector::distance::kernels::f16::F16Entry;
-use crate::multi_vector::{BlockTransposed, BlockTransposedRef, MatRef, Standard};
-use diskann_utils::Reborrow;
-
-impl QueryComputer<half::f16> {
-    /// Build an f16 query computer, selecting the optimal architecture and
-    /// GROUP for the current CPU at runtime.
-    pub fn new(query: MatRef<'_, Standard<half::f16>>) -> Self {
-        diskann_wide::arch::dispatch1_no_features(BuildComputer, query)
-    }
-}
-
-impl<A, const GROUP: usize> DynQueryComputer<half::f16>
-    for Prepared<A, BlockTransposed<half::f16, GROUP>>
-where
-    A: Architecture,
-    F16Entry<GROUP>: for<'a> diskann_wide::arch::Target3<
-            A,
-            (),
-            BlockTransposedRef<'a, half::f16, GROUP>,
-            MatRef<'a, Standard<half::f16>>,
-            &'a mut [f32],
-        >,
-{
-    fn compute_max_sim(&self, doc: MatRef<'_, Standard<half::f16>>, scores: &mut [f32]) {
-        let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()];
-        self.arch.run3(
-            F16Entry::<GROUP>,
-            self.prepared.reborrow(),
-            doc,
-            &mut scratch,
-        );
-        for (dst, &src) in scores.iter_mut().zip(&scratch[..self.prepared.nrows()]) {
-            *dst = -src;
-        }
-    }
-
-    fn nrows(&self) -> usize {
-        self.prepared.nrows()
-    }
-}
-
-#[derive(Debug, Clone, Copy)]
-pub(super) struct BuildComputer;
-
-impl diskann_wide::arch::Target1<Scalar, QueryComputer<half::f16>, MatRef<'_, Standard<half::f16>>>
-    for BuildComputer
-{
-    fn run(self, arch: Scalar, query: MatRef<'_, Standard<half::f16>>) -> QueryComputer<half::f16> {
-        QueryComputer {
-            inner: Box::new(build_prepared::<half::f16, _, 8>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-impl diskann_wide::arch::Target1<V3, QueryComputer<half::f16>, MatRef<'_, Standard<half::f16>>>
-    for BuildComputer
-{
-    fn run(self, arch: V3, query: MatRef<'_, Standard<half::f16>>) -> QueryComputer<half::f16> {
-        QueryComputer {
-            inner: Box::new(build_prepared::<half::f16, _, 16>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-impl diskann_wide::arch::Target1<V4, QueryComputer<half::f16>, MatRef<'_, Standard<half::f16>>>
-    for BuildComputer
-{
-    fn run(self, arch: V4, query: MatRef<'_, Standard<half::f16>>) -> QueryComputer<half::f16> {
-        let arch = arch.retarget();
-        QueryComputer {
-            inner: Box::new(build_prepared::<half::f16, _, 16>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "aarch64")]
-impl diskann_wide::arch::Target1<Neon, QueryComputer<half::f16>, MatRef<'_, Standard<half::f16>>>
-    for BuildComputer
-{
-    fn run(self, arch: Neon, query: MatRef<'_, Standard<half::f16>>) -> QueryComputer<half::f16> {
-        let arch = arch.retarget();
-        QueryComputer {
-            inner: Box::new(build_prepared::<half::f16, _, 8>(arch, query)),
-        }
-    }
-}
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs
deleted file mode 100644
index 9ff16b8b4..000000000
--- a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-use diskann_wide::Architecture;
-use diskann_wide::arch::Scalar;
-#[cfg(target_arch = "aarch64")]
-use diskann_wide::arch::aarch64::Neon;
-#[cfg(target_arch = "x86_64")]
-use diskann_wide::arch::x86_64::{V3, V4};
-
-use super::{DynQueryComputer, Prepared, QueryComputer, build_prepared};
-use crate::multi_vector::distance::kernels::f32::F32Kernel;
-use crate::multi_vector::{BlockTransposed, BlockTransposedRef, MatRef, Standard};
-use diskann_utils::Reborrow;
-
-impl QueryComputer<f32> {
-    /// Build an f32 query computer, selecting the optimal architecture and
-    /// GROUP for the current CPU at runtime.
-    pub fn new(query: MatRef<'_, Standard<f32>>) -> Self {
-        diskann_wide::arch::dispatch1_no_features(BuildComputer, query)
-    }
-}
-
-impl<A, const GROUP: usize> DynQueryComputer<f32> for Prepared<A, BlockTransposed<f32, GROUP>>
-where
-    A: Architecture,
-    F32Kernel<GROUP>: for<'a> diskann_wide::arch::Target3<
-            A,
-            (),
-            BlockTransposedRef<'a, f32, GROUP>,
-            MatRef<'a, Standard<f32>>,
-            &'a mut [f32],
-        >,
-{
-    fn compute_max_sim(&self, doc: MatRef<'_, Standard<f32>>, scores: &mut [f32]) {
-        let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()];
-        self.arch.run3(
-            F32Kernel::<GROUP>,
-            self.prepared.reborrow(),
-            doc,
-            &mut scratch,
-        );
-        for (dst, &src) in scores.iter_mut().zip(&scratch[..self.prepared.nrows()]) {
-            *dst = -src;
-        }
-    }
-
-    fn nrows(&self) -> usize {
-        self.prepared.nrows()
-    }
-}
-
-#[derive(Debug, Clone, Copy)]
-pub(super) struct BuildComputer;
-
-impl diskann_wide::arch::Target1<Scalar, QueryComputer<f32>, MatRef<'_, Standard<f32>>>
-    for BuildComputer
-{
-    fn run(self, arch: Scalar, query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
-        QueryComputer {
-            inner: Box::new(build_prepared::<f32, _, 8>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-impl diskann_wide::arch::Target1<V3, QueryComputer<f32>, MatRef<'_, Standard<f32>>>
-    for BuildComputer
-{
-    fn run(self, arch: V3, query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
-        QueryComputer {
-            inner: Box::new(build_prepared::<f32, _, 16>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-impl diskann_wide::arch::Target1<V4, QueryComputer<f32>, MatRef<'_, Standard<f32>>>
-    for BuildComputer
-{
-    fn run(self, arch: V4, query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
-        // V4 delegates to V3 — the V3 micro-kernel is valid on V4 hardware.
-        let arch = arch.retarget();
-        QueryComputer {
-            inner: Box::new(build_prepared::<f32, _, 16>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "aarch64")]
-impl diskann_wide::arch::Target1<Neon, QueryComputer<f32>, MatRef<'_, Standard<f32>>>
-    for BuildComputer
-{
-    fn run(self, arch: Neon, query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
-        // Neon delegates to Scalar.
-        let arch = arch.retarget();
-        QueryComputer {
-            inner: Box::new(build_prepared::<f32, _, 8>(arch, query)),
-        }
-    }
-}
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs b/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs
deleted file mode 100644
index fbe84fcd3..000000000
--- a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs
+++ /dev/null
@@ -1,290 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-//! Architecture-opaque query computer with runtime dispatch.
-//!
-//! [`QueryComputer`] wraps a block-transposed query and a captured
-//! architecture token behind a trait-object vtable. CPU detection happens
-//! once at construction; every subsequent distance call goes through
-//! [`Architecture::run3`](diskann_wide::Architecture::run3) with full
-//! `#[target_feature]` propagation — no re-dispatch and no enum matching
-//! on the hot path.
-//!
-//! # Usage
-//!
-//! ```
-//! use diskann_quantization::multi_vector::{
-//!     QueryComputer, MatRef, Standard,
-//! };
-//!
-//! let query_data = [1.0f32, 0.0, 0.0, 1.0];
-//! let doc_data = [1.0f32, 0.0, 0.0, 1.0];
-//!
-//! let query = MatRef::new(Standard::new(2, 2).unwrap(), &query_data).unwrap();
-//! let doc = MatRef::new(Standard::new(2, 2).unwrap(), &doc_data).unwrap();
-//!
-//! // Build — runtime detects arch, picks optimal GROUP, captures both
-//! let computer = QueryComputer::<f32>::new(query);
-//!
-//! // Distance — vtable → arch.run3 with target_feature propagation
-//! let dist = computer.chamfer(doc);
-//! assert_eq!(dist, -2.0);
-//! ```
-
-mod f16;
-mod f32;
-
-use crate::multi_vector::{BlockTransposed, MatRef, Standard};
-
-/// Architecture-dispatched query computer for multi-vector distance.
-#[derive(Debug)]
-pub struct QueryComputer<T: Copy> {
-    inner: Box<dyn DynQueryComputer<T>>,
-}
-
-impl<T: Copy> QueryComputer<T> {
-    /// Number of logical (non-padded) query vectors.
-    #[inline]
-    pub fn nrows(&self) -> usize {
-        self.inner.nrows()
-    }
-
-    /// Compute Chamfer distance (sum of per-query max similarities, negated).
-    ///
-    /// Returns `0.0` if the document has zero vectors.
-    pub fn chamfer(&self, doc: MatRef<'_, Standard<T>>) -> f32 {
-        let nq = self.nrows();
-        if doc.num_vectors() == 0 {
-            return 0.0;
-        }
-        let mut scores = vec![0.0f32; nq];
-        self.max_sim(doc, &mut scores);
-        scores.iter().sum()
-    }
-
-    /// Compute per-query-vector max similarities into `scores`.
-    ///
-    /// `scores` must have length equal to [`nrows()`](Self::nrows).
-    /// Each entry is the negated max inner product for that query vector.
-    ///
-    /// # Panics
-    ///
-    /// Panics if `scores.len() != self.nrows()`.
-    pub fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
-        let nq = self.nrows();
-        assert_eq!(
-            scores.len(),
-            nq,
-            "scores buffer not right size: {} != {}",
-            scores.len(),
-            nq
-        );
-
-        if doc.num_vectors() == 0 {
-            return;
-        }
-
-        self.inner.compute_max_sim(doc, scores);
-    }
-}
-
-trait DynQueryComputer<T: Copy>: std::fmt::Debug + Send + Sync {
-    fn compute_max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]);
-    fn nrows(&self) -> usize;
-}
-
-#[derive(Debug)]
-struct Prepared<A, Q> {
-    arch: A,
-    prepared: Q,
-}
-
-fn build_prepared<T: Copy + Default, A, const GROUP: usize>(
-    arch: A,
-    query: MatRef<'_, Standard<T>>,
-) -> Prepared<A, BlockTransposed<T, GROUP>> {
-    let prepared = BlockTransposed::<T, GROUP>::from_matrix_view(query.as_matrix_view());
-    Prepared { arch, prepared }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::multi_vector::{Chamfer, MaxSim, QueryMatRef};
-    use diskann_vector::distance::InnerProduct;
-    use diskann_vector::{DistanceFunctionMut, PureDistanceFunction};
-
-    trait FromF32 {
-        fn from_f32(v: f32) -> Self;
-    }
-
-    impl FromF32 for f32 {
-        fn from_f32(v: f32) -> Self {
-            v
-        }
-    }
-
-    impl FromF32 for half::f16 {
-        fn from_f32(v: f32) -> Self {
-            diskann_wide::cast_f32_to_f16(v)
-        }
-    }
-
-    fn make_mat<T: Copy>(data: &[T], nrows: usize, ncols: usize) -> MatRef<'_, Standard<T>> {
-        MatRef::new(Standard::new(nrows, ncols).unwrap(), data).unwrap()
-    }
-
-    fn make_test_data<T: FromF32>(len: usize, ceil: usize, shift: usize) -> Vec<T> {
-        (0..len)
-            .map(|v| T::from_f32(((v + shift) % ceil) as f32))
-            .collect()
-    }
-
-    /// Shapes for the `chamfer_matches_fallback` / `max_sim_matches_fallback`
-    /// agreement checks: (num_queries, num_docs, dim).
-    ///
-    /// This matrix targets the API-layer wiring that lives above the
-    /// kernel — `QueryComputer::new` query setup, `chamfer` row
-    /// summation, `max_sim` per-row writeback, and the f16 query
-    /// conversion path — not kernel correctness. A small
-    /// representative set is sufficient because exhaustive shape
-    /// coverage (panel boundaries, B-remainder classes, prime `k`,
-    /// degenerate dims) is pinned one layer below in
-    /// `kernels::tiled_reduce::tests::NAIVE_CASES`, and structural
-    /// loop-path coverage in `tiled_reduce_all_loop_paths_match_naive`.
-    const TEST_CASES: &[(usize, usize, usize)] = &[
-        (1, 1, 4), // Degenerate
-        (5, 3, 5), // Prime k; nq > 1 and nd > 1 exercise chamfer summation
-        //              and per-row max_sim writeback on a non-trivial shape
-        (17, 4, 64), // A-panel remainder crossing both Scalar and V3 panel widths
-        (16, 6, 32), // B-remainder ≠ 1 (V3 b_remainder = 2)
-    ];
-
-    fn check_chamfer_matches<T: Copy + FromF32>(
-        build: fn(MatRef<'_, Standard<T>>) -> QueryComputer<T>,
-        tol: f32,
-        label: &str,
-    ) where
-        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
-    {
-        for &(nq, nd, dim) in TEST_CASES {
-            let query_data = make_test_data::<T>(nq * dim, dim, dim / 2);
-            let doc_data = make_test_data::<T>(nd * dim, dim, dim);
-
-            let query = make_mat(&query_data, nq, dim);
-            let doc = make_mat(&doc_data, nd, dim);
-
-            let expected = Chamfer::evaluate(QueryMatRef::from(query), doc);
-            let actual = build(query).chamfer(doc);
-
-            assert!(
-                (actual - expected).abs() < tol,
-                "{label}Chamfer mismatch for ({nq},{nd},{dim}): actual={actual}, expected={expected}",
-            );
-        }
-    }
-
-    fn check_max_sim_matches<T: Copy + FromF32>(
-        build: fn(MatRef<'_, Standard<T>>) -> QueryComputer<T>,
-        tol: f32,
-        label: &str,
-    ) where
-        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
-    {
-        for &(nq, nd, dim) in TEST_CASES {
-            let query_data = make_test_data::<T>(nq * dim, dim, dim / 2);
-            let doc_data = make_test_data::<T>(nd * dim, dim, dim);
-
-            let query = make_mat(&query_data, nq, dim);
-            let doc = make_mat(&doc_data, nd, dim);
-
-            let mut expected_scores = vec![0.0f32; nq];
-            let _ = MaxSim::new(&mut expected_scores)
-                .unwrap()
-                .evaluate(QueryMatRef::from(query), doc);
-
-            let computer = build(query);
-            let mut actual_scores = vec![0.0f32; nq];
-            computer.max_sim(doc, &mut actual_scores);
-
-            for i in 0..nq {
-                assert!(
-                    (actual_scores[i] - expected_scores[i]).abs() < tol,
-                    "{label}MaxSim[{i}] mismatch for ({nq},{nd},{dim}): actual={}, expected={}",
-                    actual_scores[i],
-                    expected_scores[i],
-                );
-            }
-        }
-    }
-
-    #[test]
-    fn query_computer_dimensions() {
-        let data = vec![1.0f32; 5 * 8];
-        let query = make_mat(&data, 5, 8);
-        let computer = QueryComputer::<f32>::new(query);
-
-        assert_eq!(computer.nrows(), 5);
-    }
-
-    #[test]
-    fn query_computer_f16_dimensions() {
-        let data = vec![diskann_wide::cast_f32_to_f16(1.0); 5 * 8];
-        let query = make_mat(data.as_slice(), 5, 8);
-        let computer = QueryComputer::<half::f16>::new(query);
-
-        assert_eq!(computer.nrows(), 5);
-    }
-
-    #[test]
-    fn chamfer_with_zero_docs() {
-        let query = make_mat(&[1.0f32, 0.0, 0.0, 1.0], 2, 2);
-        let computer = QueryComputer::<f32>::new(query);
-        let doc = make_mat(&[], 0, 2);
-        assert_eq!(computer.chamfer(doc), 0.0);
-    }
-
-    #[test]
-    fn max_sim_with_zero_docs() {
-        let query = make_mat(&[1.0f32, 0.0, 0.0, 1.0], 2, 2);
-        let computer = QueryComputer::<f32>::new(query);
-        let doc = make_mat::<f32>(&[], 0, 2);
-        let mut scores = vec![0.0f32; 2];
-        computer.max_sim(doc, &mut scores);
-        // With zero docs the scores buffer is left untouched.
-        for &s in &scores {
-            assert_eq!(s, 0.0, "zero-doc MaxSim should leave scores untouched");
-        }
-    }
-
-    #[test]
-    #[should_panic(expected = "scores buffer not right size")]
-    fn max_sim_panics_on_size_mismatch() {
-        let query = make_mat(&[1.0f32, 2.0, 3.0, 4.0], 2, 2);
-        let computer = QueryComputer::<f32>::new(query);
-        let doc = make_mat(&[1.0, 1.0], 1, 2);
-        let mut scores = vec![0.0f32; 3]; // Wrong size
-        computer.max_sim(doc, &mut scores);
-    }
-
-    macro_rules! test_matches_fallback {
-        ($mod_name:ident, $ty:ty, $tol:expr, $label:literal) => {
-            mod $mod_name {
-                use super::*;
-
-                #[test]
-                fn chamfer_matches_fallback() {
-                    check_chamfer_matches(QueryComputer::<$ty>::new, $tol, $label);
-                }
-
-                #[test]
-                fn max_sim_matches_fallback() {
-                    check_max_sim_matches(QueryComputer::<$ty>::new, $tol, $label);
-                }
-            }
-        };
-    }
-
-    test_matches_fallback!(f32, f32, 1e-10, "f32 ");
-    test_matches_fallback!(f16, half::f16, 1e-10, "f16 ");
-}
diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs
index 70629d44c..31c430995 100644
--- a/diskann-quantization/src/multi_vector/matrix.rs
+++ b/diskann-quantization/src/multi_vector/matrix.rs
@@ -712,6 +712,13 @@ impl<T: NewCloned> Clone for Mat<T> {
 }
 
 impl<T: Copy> Mat<Standard<T>> {
+    /// Construct a [`Mat`] by calling `f` exactly `repr.num_elements()` times, in row-major order.
+    pub fn from_fn<F: FnMut() -> T>(repr: Standard<T>, mut f: F) -> Self {
+        let b: Box<[T]> = (0..repr.num_elements()).map(|_| f()).collect();
+        // SAFETY: `b` is collected from `0..repr.num_elements()`, so it has exactly that length.
+        unsafe { repr.box_to_mat(b) }
+    }
+
     /// Returns the raw dimension (columns) of the vectors in the matrix.
     #[inline]
     pub fn vector_dim(&self) -> usize {
diff --git a/diskann-quantization/src/multi_vector/mod.rs b/diskann-quantization/src/multi_vector/mod.rs
index 3670b1aaf..edeca4ef0 100644
--- a/diskann-quantization/src/multi_vector/mod.rs
+++ b/diskann-quantization/src/multi_vector/mod.rs
@@ -20,9 +20,12 @@
 //! | [`BlockTransposedRef`] | Immutable view of a block-transposed matrix |
 //! | [`BlockTransposedMut`] | Mutable view of a block-transposed matrix |
 //! | [`QueryMatRef`] | Query wrapper for asymmetric distances |
-//! | [`QueryComputer`] | Architecture-dispatched SIMD query computer |
 //! | [`MaxSim`] | Per-query-vector max similarity computation |
 //! | [`Chamfer`] | Asymmetric Chamfer distance (sum of MaxSim) |
+//! | [`MaxSimKernel`] | Object-safe kernel returned by [`build_max_sim`] |
+//! | [`MaxSimElement`] | Sealed trait gating element types the factory accepts |
+//! | [`MaxSimIsa`] | ISA selector for the factory functions |
+//! | [`Erase`] | BYOTE visitor used by the factory |
 //!
 //! # Example
 //!
@@ -72,7 +75,10 @@ pub mod distance;
 pub(crate) mod matrix;
 
 pub use block_transposed::{BlockTransposed, BlockTransposedMut, BlockTransposedRef};
-pub use distance::{Chamfer, MaxSim, MaxSimError, QueryComputer, QueryMatRef};
+pub use distance::{
+    BoxErase, Chamfer, Erase, MaxSim, MaxSimElement, MaxSimError, MaxSimIsa, MaxSimKernel,
+    NotSupported, QueryMatRef, build_max_sim,
+};
 pub use matrix::{
     Defaulted, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, Overflow,
     Repr, ReprMut, ReprOwned, SliceError, Standard,