Skip to content

Commit 67f59e2

Browse files
authored
perf(qdp): Implement async prefetching and native f32 dispatch pipelines (#1242)
* Perf: Implement asynchronous background prefetching to eliminate GPU starvation in QDP pipeline * perf(qdp): Implement async prefetching and native f32 dispatch pipelines * fix ci errors * fix ci errors * update and improve * fix ci errors * fix ci errors * update and improve * update and improve * fix ci errors * update * fix ci errors * fix ci errors
1 parent c2676eb commit 67f59e2

File tree

14 files changed

+1006
-315
lines changed

14 files changed

+1006
-315
lines changed

qdp/qdp-core/src/dlpack.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,13 +252,16 @@ impl GpuStateVector {
252252
/// # Safety
253253
/// Freed by DLPack deleter when PyTorch releases tensor.
254254
/// Do not free manually.
255+
#[allow(clippy::manual_is_multiple_of)]
255256
pub fn to_dlpack(&self) -> *mut DLManagedTensor {
256257
// Always return 2D tensor: Batch [num_samples, state_len], Single [1, state_len]
257258
let (shape, strides) = if let Some(num_samples) = self.num_samples {
258259
// Batch: [num_samples, state_len_per_sample]
259260
debug_assert!(
260-
num_samples > 0 && self.size_elements.is_multiple_of(num_samples),
261-
"Batch state vector size must be divisible by num_samples"
261+
num_samples > 0 && self.size_elements % num_samples == 0,
262+
"Batch mismatch: {} elements cannot be evenly divided into {} samples",
263+
self.size_elements,
264+
num_samples
262265
);
263266
let state_len_per_sample = self.size_elements / num_samples;
264267
let shape = vec![num_samples as i64, state_len_per_sample as i64];

qdp/qdp-core/src/gpu/encodings/amplitude.rs

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,232 @@ impl QuantumEncoder for AmplitudeEncoder {
457457
Ok(batch_state_vector)
458458
}
459459

460+
/// Encode multiple samples in a single GPU allocation and kernel launch for f32 inputs
461+
#[cfg(target_os = "linux")]
462+
fn encode_batch_f32(
463+
&self,
464+
device: &Arc<CudaDevice>,
465+
batch_data: &[f32],
466+
num_samples: usize,
467+
sample_size: usize,
468+
num_qubits: usize,
469+
) -> Result<GpuStateVector> {
470+
crate::profile_scope!("AmplitudeEncoder::encode_batch_f32");
471+
472+
let state_len = 1 << num_qubits;
473+
474+
if sample_size == 0 {
475+
return Err(MahoutError::InvalidInput(
476+
"sample_size cannot be zero".into(),
477+
));
478+
}
479+
if sample_size > state_len {
480+
return Err(MahoutError::InvalidInput(format!(
481+
"sample_size {} exceeds state vector length {} (2^{} qubits)",
482+
sample_size, state_len, num_qubits
483+
)));
484+
}
485+
if batch_data.len() != num_samples * sample_size {
486+
return Err(MahoutError::InvalidInput(format!(
487+
"batch_data length mismatch (expected {} * {} = {}, got {})",
488+
num_samples,
489+
sample_size,
490+
num_samples * sample_size,
491+
batch_data.len()
492+
)));
493+
}
494+
495+
let batch_state_vector = {
496+
crate::profile_scope!("GPU::AllocBatch_f32");
497+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
498+
};
499+
500+
// Upload input data to GPU
501+
let input_batch_gpu = {
502+
crate::profile_scope!("GPU::H2D_InputBatch_f32");
503+
device.htod_sync_copy(batch_data).map_err(|e| {
504+
MahoutError::MemoryAllocation(format!("Failed to upload batch input: {:?}", e))
505+
})?
506+
};
507+
508+
// Compute inverse norms on GPU using warp-reduced kernel
509+
let inv_norms_gpu = {
510+
crate::profile_scope!("GPU::BatchNormKernel_f32");
511+
use cudarc::driver::DevicePtrMut;
512+
let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
513+
MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
514+
})?;
515+
516+
let ret = unsafe {
517+
launch_l2_norm_batch_f32(
518+
*input_batch_gpu.device_ptr() as *const f32,
519+
num_samples,
520+
sample_size,
521+
*buffer.device_ptr_mut() as *mut f32,
522+
std::ptr::null_mut(), // default stream
523+
)
524+
};
525+
526+
if ret != 0 {
527+
return Err(MahoutError::KernelLaunch(format!(
528+
"Norm reduction kernel failed: {} ({})",
529+
ret,
530+
cuda_error_to_string(ret)
531+
)));
532+
}
533+
buffer
534+
};
535+
536+
// Validate norms on host
537+
{
538+
crate::profile_scope!("GPU::NormValidation_f32");
539+
let host_inv_norms = device
540+
.dtoh_sync_copy(&inv_norms_gpu)
541+
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
542+
543+
if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
544+
return Err(MahoutError::InvalidInput(
545+
"One or more samples have zero or invalid norm".to_string(),
546+
));
547+
}
548+
}
549+
550+
// Launch batch kernel
551+
{
552+
crate::profile_scope!("GPU::BatchKernelLaunch_f32");
553+
use cudarc::driver::DevicePtr;
554+
let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
555+
MahoutError::InvalidInput(
556+
"Batch state vector precision mismatch (expected float32 buffer)".to_string(),
557+
)
558+
})?;
559+
let ret = unsafe {
560+
launch_amplitude_encode_batch_f32(
561+
*input_batch_gpu.device_ptr() as *const f32,
562+
state_ptr as *mut c_void,
563+
*inv_norms_gpu.device_ptr() as *const f32,
564+
num_samples,
565+
sample_size,
566+
state_len,
567+
std::ptr::null_mut(), // default stream
568+
)
569+
};
570+
571+
if ret != 0 {
572+
return Err(MahoutError::KernelLaunch(format!(
573+
"Batch kernel launch failed: {} ({})",
574+
ret,
575+
cuda_error_to_string(ret)
576+
)));
577+
}
578+
}
579+
580+
{
581+
crate::profile_scope!("GPU::Synchronize");
582+
device
583+
.synchronize()
584+
.map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", e)))?;
585+
}
586+
587+
Ok(batch_state_vector)
588+
}
589+
590+
#[cfg(target_os = "linux")]
591+
unsafe fn encode_batch_from_gpu_ptr_f32(
592+
&self,
593+
device: &Arc<CudaDevice>,
594+
input_batch_d: *const c_void,
595+
num_samples: usize,
596+
sample_size: usize,
597+
num_qubits: usize,
598+
stream: *mut c_void,
599+
) -> Result<GpuStateVector> {
600+
let state_len = 1 << num_qubits;
601+
if sample_size == 0 {
602+
return Err(MahoutError::InvalidInput(
603+
"Sample size cannot be zero".into(),
604+
));
605+
}
606+
if sample_size > state_len {
607+
return Err(MahoutError::InvalidInput(format!(
608+
"Sample size {} exceeds state vector size {} (2^{} qubits)",
609+
sample_size, state_len, num_qubits
610+
)));
611+
}
612+
let input_batch_d = input_batch_d as *const f32;
613+
let batch_state_vector = {
614+
crate::profile_scope!("GPU::AllocBatch_f32");
615+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
616+
};
617+
let inv_norms_gpu = {
618+
crate::profile_scope!("GPU::BatchNormKernel_f32");
619+
use cudarc::driver::DevicePtrMut;
620+
let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
621+
MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
622+
})?;
623+
let ret = unsafe {
624+
launch_l2_norm_batch_f32(
625+
input_batch_d,
626+
num_samples,
627+
sample_size,
628+
*buffer.device_ptr_mut() as *mut f32,
629+
stream,
630+
)
631+
};
632+
if ret != 0 {
633+
return Err(MahoutError::KernelLaunch(format!(
634+
"Norm reduction kernel failed with CUDA error code: {} ({})",
635+
ret,
636+
cuda_error_to_string(ret)
637+
)));
638+
}
639+
buffer
640+
};
641+
{
642+
crate::profile_scope!("GPU::NormValidation_f32");
643+
let host_inv_norms = device
644+
.dtoh_sync_copy(&inv_norms_gpu)
645+
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
646+
if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
647+
return Err(MahoutError::InvalidInput(
648+
"One or more samples have zero or invalid norm".to_string(),
649+
));
650+
}
651+
}
652+
{
653+
crate::profile_scope!("GPU::BatchKernelLaunch_f32");
654+
use cudarc::driver::DevicePtr;
655+
let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
656+
MahoutError::InvalidInput(
657+
"Batch state vector precision mismatch (expected float32 buffer)".to_string(),
658+
)
659+
})?;
660+
let ret = unsafe {
661+
launch_amplitude_encode_batch_f32(
662+
input_batch_d,
663+
state_ptr as *mut c_void,
664+
*inv_norms_gpu.device_ptr() as *const f32,
665+
num_samples,
666+
sample_size,
667+
state_len,
668+
stream,
669+
)
670+
};
671+
if ret != 0 {
672+
return Err(MahoutError::KernelLaunch(format!(
673+
"Batch kernel launch failed with CUDA error code: {} ({})",
674+
ret,
675+
cuda_error_to_string(ret)
676+
)));
677+
}
678+
}
679+
{
680+
crate::profile_scope!("GPU::Synchronize");
681+
sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
682+
}
683+
Ok(batch_state_vector)
684+
}
685+
460686
fn name(&self) -> &'static str {
461687
"amplitude"
462688
}

qdp/qdp-core/src/gpu/encodings/mod.rs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,41 @@ pub trait QuantumEncoder: Send + Sync {
134134
self.name()
135135
)))
136136
}
137+
138+
/// Encode multiple samples in a single GPU allocation and kernel launch using f32 inputs.
139+
fn encode_batch_f32(
140+
&self,
141+
_device: &Arc<CudaDevice>,
142+
_batch_data: &[f32],
143+
_num_samples: usize,
144+
_sample_size: usize,
145+
_num_qubits: usize,
146+
) -> Result<GpuStateVector> {
147+
Err(MahoutError::NotImplemented(format!(
148+
"encode_batch_f32 not implemented for {}",
149+
self.name()
150+
)))
151+
}
152+
153+
/// Encode batch from existing GPU pointer (zero-copy) for f32 inputs.
154+
///
155+
/// # Safety
156+
/// Caller must ensure `input_batch_d` points to valid GPU memory (f32).
157+
#[cfg(target_os = "linux")]
158+
unsafe fn encode_batch_from_gpu_ptr_f32(
159+
&self,
160+
_device: &Arc<CudaDevice>,
161+
_input_batch_d: *const c_void,
162+
_num_samples: usize,
163+
_sample_size: usize,
164+
_num_qubits: usize,
165+
_stream: *mut c_void,
166+
) -> Result<GpuStateVector> {
167+
Err(MahoutError::NotImplemented(format!(
168+
"encode_batch_from_gpu_ptr_f32 not supported for {}",
169+
self.name()
170+
)))
171+
}
137172
}
138173

139174
// Encoding implementations

0 commit comments

Comments
 (0)