Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 34 additions & 27 deletions qdp/qdp-core/src/gpu/encodings/amplitude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,21 +245,9 @@ impl QuantumEncoder for AmplitudeEncoder {
buffer
};

// Validate norms on host to catch zero or NaN samples early
{
crate::profile_scope!("GPU::NormValidation");
let host_inv_norms = device
.dtoh_sync_copy(&inv_norms_gpu)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;

if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
return Err(MahoutError::InvalidInput(
"One or more samples have zero or invalid norm".to_string(),
));
}
}

// Launch batch kernel
// Launch batch encode kernel — takes GPU norm buffer directly, no D2H needed yet.
// We defer the norm validation D2H copy until AFTER the encode kernel + sync so that
// the norm kernel → encode kernel sequence runs without an intermediate GPU-CPU roundtrip.
{
crate::profile_scope!("GPU::BatchKernelLaunch");
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
Expand Down Expand Up @@ -288,14 +276,30 @@ impl QuantumEncoder for AmplitudeEncoder {
}
}

// Synchronize
// Synchronize — all GPU work (norm + encode) complete after this point.
{
crate::profile_scope!("GPU::Synchronize");
device
.synchronize()
.map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", e)))?;
}

// Validate norms on host AFTER sync: D2H copy no longer blocks the encode kernel.
// This preserves error detection for zero/NaN samples without adding a mid-pipeline
// GPU-CPU roundtrip between the norm and encode kernels.
{
crate::profile_scope!("GPU::NormValidation");
let host_inv_norms = device
.dtoh_sync_copy(&inv_norms_gpu)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;

if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
return Err(MahoutError::InvalidInput(
"One or more samples have zero or invalid norm".to_string(),
));
}
}

Ok(batch_state_vector)
}

Expand Down Expand Up @@ -412,17 +416,8 @@ impl QuantumEncoder for AmplitudeEncoder {
}
buffer
};
{
crate::profile_scope!("GPU::NormValidation");
let host_inv_norms = device
.dtoh_sync_copy(&inv_norms_gpu)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
return Err(MahoutError::InvalidInput(
"One or more samples have zero or invalid norm".to_string(),
));
}
}
// Launch encode kernel before D2H norm validation: GPU norm buffer is passed directly,
// so the encode kernel can run immediately after the norm kernel without a CPU roundtrip.
{
crate::profile_scope!("GPU::BatchKernelLaunch");
use cudarc::driver::DevicePtr;
Expand Down Expand Up @@ -450,10 +445,22 @@ impl QuantumEncoder for AmplitudeEncoder {
)));
}
}
// Synchronize first; then validate norms on host (D2H after all GPU work is done).
{
crate::profile_scope!("GPU::Synchronize");
sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
}
{
crate::profile_scope!("GPU::NormValidation");
let host_inv_norms = device
.dtoh_sync_copy(&inv_norms_gpu)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
return Err(MahoutError::InvalidInput(
"One or more samples have zero or invalid norm".to_string(),
));
}
}
Ok(batch_state_vector)
}

Expand Down
41 changes: 41 additions & 0 deletions qdp/qdp-core/src/pipeline_runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,47 @@ impl PipelineIterator {
})
}

/// Create a pipeline iterator from an in-memory array (e.g. from Python numpy).
/// Data is owned by the iterator; the full encode loop runs in Rust (take_batch + encode_batch).
/// Create a pipeline iterator from an in-memory array (e.g. from Python numpy).
/// Data is owned by the iterator; the full encode loop runs in Rust
/// (take_batch + encode_batch).
///
/// # Errors
/// Returns `MahoutError::InvalidInput` when `sample_size` does not match the
/// vector length implied by `config` (num_qubits + encoding method), when
/// `num_samples * sample_size` overflows `usize`, or when `data.len()` does
/// not equal `num_samples * sample_size`.
pub fn new_from_array(
    engine: QdpEngine,
    data: Vec<f64>,
    num_samples: usize,
    sample_size: usize,
    config: PipelineConfig,
    batch_limit: usize,
) -> Result<Self> {
    let vector_len = vector_len(config.num_qubits, &config.encoding_method);
    if sample_size != vector_len {
        return Err(MahoutError::InvalidInput(format!(
            "Array sample_size {} does not match vector_len {} for num_qubits={}, encoding={}",
            sample_size, vector_len, config.num_qubits, config.encoding_method
        )));
    }
    // Checked multiplication: a huge num_samples * sample_size must not wrap
    // around (possible on 32-bit targets) and accidentally pass the length check.
    let expected_len = num_samples.checked_mul(sample_size).ok_or_else(|| {
        MahoutError::InvalidInput(format!(
            "num_samples ({}) * sample_size ({}) overflows usize",
            num_samples, sample_size
        ))
    })?;
    if data.len() != expected_len {
        return Err(MahoutError::InvalidInput(format!(
            "Array length {} is not num_samples ({}) * sample_size ({})",
            data.len(),
            num_samples,
            sample_size
        )));
    }
    let source = DataSource::InMemory {
        data,
        cursor: 0,
        num_samples,
        sample_size,
        batches_yielded: 0,
        batch_limit,
    };
    Ok(Self {
        engine,
        config,
        source,
        vector_len,
    })
}

/// Create a pipeline iterator from a Parquet file using streaming read (Phase 2b).
/// Only `.parquet` is supported; reduces memory for large files by reading in chunks.
/// Validates sample_size == vector_len after the first chunk.
Expand Down
22 changes: 22 additions & 0 deletions qdp/qdp-python/benchmark/encoding_benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,25 @@ uv run python benchmark/encoding_benchmarks/qdp_pipeline/iris_amplitude.py --hel
uv run python benchmark/encoding_benchmarks/qdp_pipeline/mnist_amplitude.py --help
uv run python benchmark/encoding_benchmarks/qdp_pipeline/svhn_iqp.py --help
```

## Credit Card Fraud amplitude baseline (PennyLane)

Minimal, reproducible steps (run from `qdp/qdp-python`):

1. **Download dataset (once)** — Kaggle `creditcard.csv` mirror:

```bash
mkdir -p benchmark/encoding_benchmarks/pennylane_baseline/data
curl -L -o benchmark/encoding_benchmarks/pennylane_baseline/data/creditcard.csv \
https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv
```

2. **Run the PennyLane baseline** — StandardScaler → PCA(16) → L2 norm → 4‑qubit amplitude VQC:

```bash
uv run python benchmark/encoding_benchmarks/pennylane_baseline/creditcardfraud_amplitude.py \
--data-file benchmark/encoding_benchmarks/pennylane_baseline/data/creditcard.csv \
--max-samples 300000 --iters 200 --batch-size 512 --trials 1
```

This prints compile time, train time / throughput, and task metrics (AUPRC, F1, precision, recall) on the test set.
Loading
Loading