datastax · MarkWolters · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
@@ -213,7 +213,7 @@ jobs:
             java ${{ matrix.jdk >= 20 && '--enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector' || '' }} \
               ${{ matrix.jdk >= 22 && '-Djvector.experimental.enable_native_vectorization=true' || '' }} \
               -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/heap_dump/ -Xmx${HALF_MEM_GB}g \
-              -cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.example.AutoBenchYAML --output ${SAFE_BRANCH}-bench-results ${CONFIG_ARG} dpr-1M
+              -cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.example.AutoBenchYAML --output ${SAFE_BRANCH}-bench-results ${CONFIG_ARG} dpr-gemma-1m
           else
             java ${{ matrix.jdk >= 20 && '--enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector' || '' }} \
               ${{ matrix.jdk >= 22 && '-Djvector.experimental.enable_native_vectorization=true' || '' }} \

@@ -22,6 +22,7 @@
 import io.github.jbellis.jvector.example.util.CheckpointManager;
 import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet;
 import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets;
+import io.github.jbellis.jvector.example.yaml.DatasetCollection;
 import io.github.jbellis.jvector.example.yaml.MultiConfig;
 
 import org.slf4j.Logger;
@@ -50,22 +51,7 @@
  */
 public class AutoBenchYAML {
     private static final Logger logger = LoggerFactory.getLogger(AutoBenchYAML.class);
-
-    /**
-     * Returns a list of all dataset names.
-     * This replaces the need to load datasets.yml which may not be available in all environments.
-     */
-    private static List<String> getAllDatasetNames() {
-        List<String> allDatasets = new ArrayList<>();
-        allDatasets.add("cap-1M");
-        allDatasets.add("cap-6M");
-        allDatasets.add("cohere-english-v3-1M");
-        allDatasets.add("cohere-english-v3-10M");
-        allDatasets.add("dpr-1M");
-        allDatasets.add("dpr-10M");
-
-        return allDatasets;
-    }
+    private static final String REGRESSION_TEST_KEY = "regression-tests";
 
     public static void main(String[] args) throws IOException {
         // Check for --output argument (required for this class)
@@ -112,7 +98,8 @@ public static void main(String[] args) throws IOException {
         // compile regex and do substring matching using find
         var pattern = Pattern.compile(regex);
 
-        var datasetNames = getAllDatasetNames().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList());
+        var datasetCollection = DatasetCollection.load();
+        var datasetNames = datasetCollection.getSection(REGRESSION_TEST_KEY).stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList());
 
         logger.info("Executing the following datasets: {}", datasetNames);
         List<BenchResult> results = new ArrayList<>();
@@ -148,7 +135,7 @@ public static void main(String[] args) throws IOException {
                             config.dataset = normalizedDatasetName;
                         }
                     } else {
-                        config = MultiConfig.getDefaultConfig("autoDefault");
+                        config = MultiConfig.getDefaultConfig(normalizedDatasetName);
                         config.dataset = normalizedDatasetName;
                     }
                     logger.info("Using configuration: {}", config);

@@ -17,6 +17,7 @@
 package io.github.jbellis.jvector.example.yaml;
 
 import org.yaml.snakeyaml.Yaml;
+import software.amazon.awssdk.http.auth.aws.internal.signer.chunkedencoding.Chunk;
 
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -54,4 +55,17 @@ public List<String> getAll() {
         }
         return allDatasetNames;
     }
+
+    /**
+     * Returns the dataset names listed under a single named section.
+     *
+     * @param section the section key to look up
+     * @return a new, mutable list holding that section's dataset names; empty when the
+     *         section is {@code null}, absent, or mapped to {@code null}
+     */
+    public List<String> getSection(String section) {
+        List<String> sectionDatasetNames = new ArrayList<>();
+        // A null argument can never equal a section key, so skip the lookup entirely;
+        // this also avoids handing a null key to a map implementation that rejects it.
+        if (section != null) {
+            // Map keys are unique, so a direct lookup replaces scanning every key for a match.
+            var subList = datasetNames.get(section);
+            if (subList != null) {
+                sectionDatasetNames.addAll(subList);
+            }
+        }
+        return sectionDatasetNames;
+    }
 }
@@ -51,6 +51,12 @@ dpr-1M:
 dpr-10M:
   similarity_function: COSINE
   load_behavior: NO_SCRUB
+dpr-gemma-1m:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
+dpr-gemma-10m:
+  similarity_function: COSINE
+  load_behavior: NO_SCRUB
 e5-small-v2-100k:
   similarity_function: COSINE
   load_behavior: NO_SCRUB

@@ -19,6 +19,13 @@ ann-benchmarks:
   - glove-200-angular
   - nytimes-256-angular
   - sift-128-euclidean
+regression-tests:
+  - cap-1M
+  - cap-6M
+  - cohere-english-v3-1M
+  - cohere-english-v3-10M
+  - dpr-gemma-1m
+  - dpr-gemma-10m
 #other-datasets:
 #  - dpr-1M
 #  - dpr-10M

@@ -0,0 +1,36 @@
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
+
+# NOTE(review): the regression-tests dataset list names cap-1M and cap-6M but no cap-10M,
+# and the benchmark now resolves its config by dataset name — confirm this value (and this
+# file's name) should not be cap-6M, or that cap-6M has a config of its own.
+dataset: cap-10M
+
+construction:
+  outDegree: [32]
+  efConstruction: [100]
+  neighborOverflow: [1.2f]
+  addHierarchy: [Yes]
+  refineFinalGraph: [Yes]
+  fusedGraph: [Yes]
+  compression:
+    - type: PQ
+      parameters:
+        # m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
+        mFactor: 8
+        # k: 256 # optional parameter. By default, k=256
+        centerData: No
+        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
+  reranking:
+    - NVQ
+  useSavedIndexIfExists: No
+
+search:
+  topKOverquery:
+    10: [1.0]
+  useSearchPruning: [Yes]
+  compression:
+    - type: PQ
+      parameters:
+        # m: 192
+        mFactor: 8
+        # k: 256 # optional parameter. By default, k=256
+        centerData: No
+        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
@@ -1,19 +1,20 @@
-version: 6
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
 
-dataset: auto-default
+dataset: cap-1M
 
 construction:
   outDegree: [32]
   efConstruction: [100]
   neighborOverflow: [1.2f]
   addHierarchy: [Yes]
   refineFinalGraph: [Yes]
-  fusedGraph: [No]
+  fusedGraph: [Yes]
   compression:
     - type: PQ
       parameters:
-        m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
-        # mFactor: 8
+        # m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
+        mFactor: 8
         # k: 256 # optional parameter. By default, k=256
         centerData: No
         anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
@@ -28,7 +29,8 @@ search:
   compression:
     - type: PQ
       parameters:
-        m: 192
+        # m: 192
+        mFactor: 8
         # k: 256 # optional parameter. By default, k=256
         centerData: No
         anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
@@ -0,0 +1,36 @@
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
+
+dataset: cohere-english-v3-10M
+
+construction:
+  outDegree: [32]
+  efConstruction: [100]
+  neighborOverflow: [1.2f]
+  addHierarchy: [Yes]
+  refineFinalGraph: [Yes]
+  fusedGraph: [Yes]
+  compression:
+    - type: PQ
+      parameters:
+        # m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
+        mFactor: 8
+        # k: 256 # optional parameter. By default, k=256
+        centerData: No
+        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
+  reranking:
+    - NVQ
+  useSavedIndexIfExists: No
+
+search:
+  topKOverquery:
+    10: [1.0]
+  useSearchPruning: [Yes]
+  compression:
+    - type: PQ
+      parameters:
+        # m: 192
+        mFactor: 8
+        # k: 256 # optional parameter. By default, k=256
+        centerData: No
+        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
@@ -0,0 +1,36 @@
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
+
+dataset: cohere-english-v3-1M
+
+construction:
+  outDegree: [32]
+  efConstruction: [100]
+  neighborOverflow: [1.2f]
+  addHierarchy: [Yes]
+  refineFinalGraph: [Yes]
+  fusedGraph: [Yes]
+  compression:
+    - type: PQ
+      parameters:
+        # m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
+        mFactor: 8
+        # k: 256 # optional parameter. By default, k=256
+        centerData: No
+        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
+  reranking:
+    - NVQ
+  useSavedIndexIfExists: No
+
+search:
+  topKOverquery:
+    10: [1.0]
+  useSearchPruning: [Yes]
+  compression:
+    - type: PQ
+      parameters:
+        # m: 192
+        mFactor: 8
+        # k: 256 # optional parameter. By default, k=256
+        centerData: No
+        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
@@ -0,0 +1,36 @@
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
+
+# Spelled with a lowercase "m" to match the dataset metadata key and the regression-tests list.
+dataset: dpr-gemma-10m
+
+construction:
+  outDegree: [32]
+  efConstruction: [100]
+  neighborOverflow: [1.2f]
+  addHierarchy: [Yes]
+  refineFinalGraph: [Yes]
+  fusedGraph: [Yes]
+  compression:
+    - type: PQ
+      parameters:
+        # m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
+        mFactor: 8
+        # k: 256 # optional parameter. By default, k=256
+        centerData: No
+        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
+  reranking:
+    - NVQ
+  useSavedIndexIfExists: No
+
+search:
+  topKOverquery:
+    10: [1.0]
+  useSearchPruning: [Yes]
+  compression:
+    - type: PQ
+      parameters:
+        # m: 192
+        mFactor: 8
+        # k: 256 # optional parameter. By default, k=256
+        centerData: No
+        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
@@ -0,0 +1,36 @@
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
+
+# Spelled with a lowercase "m" to match the dataset metadata key, the regression-tests list,
+# and the dataset argument passed by the CI workflow.
+dataset: dpr-gemma-1m
+
+construction:
+  outDegree: [32]
+  efConstruction: [100]
+  neighborOverflow: [1.2f]
+  addHierarchy: [Yes]
+  refineFinalGraph: [Yes]
+  fusedGraph: [Yes]
+  compression:
+    - type: PQ
+      parameters:
+        # m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
+        mFactor: 8
+        # k: 256 # optional parameter. By default, k=256
+        centerData: No
+        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
+  reranking:
+    - NVQ
+  useSavedIndexIfExists: No
+
+search:
+  topKOverquery:
+    10: [1.0]
+  useSearchPruning: [Yes]
+  compression:
+    - type: PQ
+      parameters:
+        # m: 192
+        mFactor: 8
+        # k: 256 # optional parameter. By default, k=256
+        centerData: No
+        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)