Skip to content
7 changes: 5 additions & 2 deletions qdp/qdp-core/src/dlpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,13 +252,16 @@ impl GpuStateVector {
/// # Safety
/// Freed by DLPack deleter when PyTorch releases tensor.
/// Do not free manually.
#[allow(clippy::manual_is_multiple_of)]
pub fn to_dlpack(&self) -> *mut DLManagedTensor {
// Always return 2D tensor: Batch [num_samples, state_len], Single [1, state_len]
let (shape, strides) = if let Some(num_samples) = self.num_samples {
// Batch: [num_samples, state_len_per_sample]
debug_assert!(
num_samples > 0 && self.size_elements.is_multiple_of(num_samples),
"Batch state vector size must be divisible by num_samples"
num_samples > 0 && self.size_elements % num_samples == 0,
"Batch mismatch: {} elements cannot be evenly divided into {} samples",
self.size_elements,
num_samples
);
let state_len_per_sample = self.size_elements / num_samples;
let shape = vec![num_samples as i64, state_len_per_sample as i64];
Expand Down
212 changes: 212 additions & 0 deletions qdp/qdp-core/src/gpu/encodings/amplitude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,218 @@ impl QuantumEncoder for AmplitudeEncoder {
Ok(batch_state_vector)
}

/// Encode multiple samples in a single GPU allocation and kernel launch for f32 inputs
#[cfg(target_os = "linux")]
fn encode_batch_f32(
&self,
device: &Arc<CudaDevice>,
batch_data: &[f32],
num_samples: usize,
sample_size: usize,
num_qubits: usize,
) -> Result<GpuStateVector> {
crate::profile_scope!("AmplitudeEncoder::encode_batch_f32");

// Validate inputs. Wait, Preprocessor::validate_batch currently takes f64...
// We will just do a basic length check if f32 validation is missing.
let state_len = 1 << num_qubits;
if batch_data.len() != num_samples * sample_size {
return Err(MahoutError::InvalidInput(
"batch_data length mismatch".into(),
));
}
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

encode_batch_f32 validates only batch_data.len() == num_samples * sample_size, but (unlike the f64 path) it does not reject sample_size == 0 or sample_size > 2^num_qubits. Those cases can lead to out-of-bounds behavior in kernels. Add the same input checks as the existing f64 implementation and improve the error to include expected vs actual length.

Copilot uses AI. Check for mistakes.

let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch_f32");
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
};

// Upload input data to GPU
let input_batch_gpu = {
crate::profile_scope!("GPU::H2D_InputBatch_f32");
device.htod_sync_copy(batch_data).map_err(|e| {
MahoutError::MemoryAllocation(format!("Failed to upload batch input: {:?}", e))
})?
};

// Compute inverse norms on GPU using warp-reduced kernel
let inv_norms_gpu = {
crate::profile_scope!("GPU::BatchNormKernel_f32");
use cudarc::driver::DevicePtrMut;
let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
})?;

let ret = unsafe {
launch_l2_norm_batch_f32(
*input_batch_gpu.device_ptr() as *const f32,
num_samples,
sample_size,
*buffer.device_ptr_mut() as *mut f32,
std::ptr::null_mut(), // default stream
)
};

if ret != 0 {
return Err(MahoutError::KernelLaunch(format!(
"Norm reduction kernel failed: {} ({})",
ret,
cuda_error_to_string(ret)
)));
}
buffer
};

// Validate norms on host
{
crate::profile_scope!("GPU::NormValidation_f32");
let host_inv_norms = device
.dtoh_sync_copy(&inv_norms_gpu)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;

if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
return Err(MahoutError::InvalidInput(
"One or more samples have zero or invalid norm".to_string(),
));
}
}

// Launch batch kernel
{
crate::profile_scope!("GPU::BatchKernelLaunch_f32");
use cudarc::driver::DevicePtr;
let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
MahoutError::InvalidInput(
"Batch state vector precision mismatch (expected float32 buffer)".to_string(),
)
})?;
let ret = unsafe {
launch_amplitude_encode_batch_f32(
*input_batch_gpu.device_ptr() as *const f32,
state_ptr as *mut c_void,
*inv_norms_gpu.device_ptr() as *const f32,
num_samples,
sample_size,
state_len,
std::ptr::null_mut(), // default stream
)
};

if ret != 0 {
return Err(MahoutError::KernelLaunch(format!(
"Batch kernel launch failed: {} ({})",
ret,
cuda_error_to_string(ret)
)));
}
}

{
crate::profile_scope!("GPU::Synchronize");
device
.synchronize()
.map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", e)))?;
}

Ok(batch_state_vector)
}

/// Encode a batch of f32 samples that already reside in GPU memory
/// (zero-copy: no host-to-device upload is performed).
///
/// # Safety
/// `input_batch_d` must point to valid device memory holding at least
/// `num_samples * sample_size` contiguous f32 values, and `stream` must be
/// null (default stream) or a valid CUDA stream handle.
#[cfg(target_os = "linux")]
unsafe fn encode_batch_from_gpu_ptr_f32(
    &self,
    device: &Arc<CudaDevice>,
    input_batch_d: *const c_void,
    num_samples: usize,
    sample_size: usize,
    num_qubits: usize,
    stream: *mut c_void,
) -> Result<GpuStateVector> {
    // Validate sample dimensions before any GPU work; the raw-pointer input
    // length itself cannot be checked here — it is the caller's contract.
    let state_len = 1 << num_qubits;
    if sample_size == 0 {
        return Err(MahoutError::InvalidInput(
            "Sample size cannot be zero".into(),
        ));
    }
    if sample_size > state_len {
        return Err(MahoutError::InvalidInput(format!(
            "Sample size {} exceeds state vector size {} (2^{} qubits)",
            sample_size, state_len, num_qubits
        )));
    }
    // Reinterpret the opaque pointer as f32 data (contract of this f32 entry point).
    let input_batch_d = input_batch_d as *const f32;
    // One allocation holding all num_samples state vectors.
    let batch_state_vector = {
        crate::profile_scope!("GPU::AllocBatch_f32");
        GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
    };
    // Per-sample inverse L2 norms, computed on the GPU.
    let inv_norms_gpu = {
        crate::profile_scope!("GPU::BatchNormKernel_f32");
        use cudarc::driver::DevicePtrMut;
        let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
            MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
        })?;
        // SAFETY: caller guarantees input_batch_d covers num_samples * sample_size
        // f32 elements; `buffer` holds num_samples outputs.
        let ret = unsafe {
            launch_l2_norm_batch_f32(
                input_batch_d,
                num_samples,
                sample_size,
                *buffer.device_ptr_mut() as *mut f32,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Norm reduction kernel failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
        buffer
    };
    // Validate norms on the host: a zero or non-finite inverse norm means the
    // sample cannot be normalized.
    // NOTE(review): the norm kernel was enqueued on `stream`, but
    // dtoh_sync_copy presumably synchronizes only the default stream —
    // confirm ordering is safe for non-null streams.
    {
        crate::profile_scope!("GPU::NormValidation_f32");
        let host_inv_norms = device
            .dtoh_sync_copy(&inv_norms_gpu)
            .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
        if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
            return Err(MahoutError::InvalidInput(
                "One or more samples have zero or invalid norm".to_string(),
            ));
        }
    }
    // Launch the batch encode kernel on the caller-provided stream.
    {
        crate::profile_scope!("GPU::BatchKernelLaunch_f32");
        use cudarc::driver::DevicePtr;
        let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
            MahoutError::InvalidInput(
                "Batch state vector precision mismatch (expected float32 buffer)".to_string(),
            )
        })?;
        // SAFETY: all device buffers were sized above; input pointer validity is
        // the caller's contract.
        let ret = unsafe {
            launch_amplitude_encode_batch_f32(
                input_batch_d,
                state_ptr as *mut c_void,
                *inv_norms_gpu.device_ptr() as *const f32,
                num_samples,
                sample_size,
                state_len,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Batch kernel launch failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
    }
    // Drain the caller's stream before handing the buffer back.
    {
        crate::profile_scope!("GPU::Synchronize");
        sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
    }
    Ok(batch_state_vector)
}

/// Identifier used to select this encoder.
fn name(&self) -> &'static str {
    const ENCODER_NAME: &'static str = "amplitude";
    ENCODER_NAME
}
Expand Down
35 changes: 35 additions & 0 deletions qdp/qdp-core/src/gpu/encodings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,41 @@ pub trait QuantumEncoder: Send + Sync {
self.name()
)))
}

/// Encode multiple samples in a single GPU allocation and kernel launch using f32 inputs.
///
/// # Errors
/// The default implementation always returns `MahoutError::NotImplemented`;
/// encoders that support an f32 batch path override this method.
fn encode_batch_f32(
    &self,
    _device: &Arc<CudaDevice>,
    _batch_data: &[f32],
    _num_samples: usize,
    _sample_size: usize,
    _num_qubits: usize,
) -> Result<GpuStateVector> {
    let message = format!("encode_batch_f32 not implemented for {}", self.name());
    Err(MahoutError::NotImplemented(message))
}

/// Encode batch from existing GPU pointer (zero-copy) for f32 inputs.
///
/// # Safety
/// Caller must ensure `input_batch_d` points to valid GPU memory (f32) with
/// at least `num_samples * sample_size` elements.
///
/// # Errors
/// The default implementation always returns `MahoutError::NotImplemented`
/// (message wording aligned with `encode_batch_f32` for consistency);
/// encoders that support a zero-copy f32 path override this method.
#[cfg(target_os = "linux")]
unsafe fn encode_batch_from_gpu_ptr_f32(
    &self,
    _device: &Arc<CudaDevice>,
    _input_batch_d: *const c_void,
    _num_samples: usize,
    _sample_size: usize,
    _num_qubits: usize,
    _stream: *mut c_void,
) -> Result<GpuStateVector> {
    Err(MahoutError::NotImplemented(format!(
        "encode_batch_from_gpu_ptr_f32 not implemented for {}",
        self.name()
    )))
}
}

// Encoding implementations
Expand Down
Loading
Loading