Add torch-free QuantumTensor.to_numpy() for as_numpy() and switch the PyTorch wheel index to cu126

rich7420 · rich7420 · commit 4e6b5344d7c4 · 2026-03-10T11:21:06.000+08:00
diff --git a/qdp/qdp-python/pyproject.toml b/qdp/qdp-python/pyproject.toml
@@ -49,10 +49,12 @@ benchmark = [
 
 [tool.uv.sources]
 qumat = { path = "../..", editable = true }
+torch = { index = "pytorch" }
 
+# CUDA 12.6 wheels to match driver (libnvJitLink 12_6); cu122 pulls libs that need 12_8 and fail.
 [[tool.uv.index]]
 name = "pytorch"
-url = "https://download.pytorch.org/whl/cu122"
+url = "https://download.pytorch.org/whl/cu126"
 explicit = true
 
 # Invalidate uv cache when Rust or Cargo changes so extension is rebuilt (run_throughput_pipeline_py etc.).
diff --git a/qdp/qdp-python/qumat_qdp/loader.py b/qdp/qdp-python/qumat_qdp/loader.py
@@ -31,14 +31,14 @@
 
 from collections.abc import Iterator
 from functools import lru_cache
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 
 import numpy as np
 
 if TYPE_CHECKING:
     import _qdp
 
-# Optional torch for as_torch()/as_numpy(); import at use site to avoid hard dependency.
+# Optional torch for as_torch(); as_numpy() uses QuantumTensor.to_numpy() (no torch needed).
 try:
     import torch as _torch
 except ImportError:
@@ -145,7 +145,7 @@ def as_torch(self, device: str = "cuda") -> QuantumDataLoader:
         return self
 
     def as_numpy(self) -> QuantumDataLoader:
-        """Yield batches as NumPy arrays (CPU). Conversion is done inside the loader. Returns self."""
+        """Yield batches as NumPy float64 arrays (CPU). Uses QuantumTensor.to_numpy() — no PyTorch required. Returns self."""
         self._output_format = ("numpy",)
         return self
 
@@ -371,7 +371,8 @@ def _wrap_iterator(self, raw_iter: Iterator[object]) -> Iterator[Any]:
                 yield t.cpu() if device == "cpu" else t
         elif kind == "numpy":
             for qt in raw_iter:
-                yield _torch.from_dlpack(qt).cpu().numpy()
+                # Rust QuantumTensor has to_numpy(); raw_iter is Iterator[object]
+                yield cast(Any, qt).to_numpy()
         else:
             yield from raw_iter
 
diff --git a/qdp/qdp-python/src/engine.rs b/qdp/qdp-python/src/engine.rs
@@ -722,6 +722,7 @@ impl QdpEngine {
             encoding_method,
             0,
             None,
+            qdp_core::reader::NullHandling::FillZero,
         );
         let engine = self.engine.clone();
         let iter = py
diff --git a/qdp/qdp-python/src/tensor.rs b/qdp/qdp-python/src/tensor.rs
@@ -14,10 +14,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use numpy::{PyArray2, ndarray::Array2};
 use pyo3::exceptions::PyRuntimeError;
 use pyo3::ffi;
 use pyo3::prelude::*;
 use qdp_core::dlpack::DLManagedTensor;
+use std::ffi::c_void;
+
+// CUDA Runtime API — already linked transitively by qdp-core.
+unsafe extern "C" {
+    fn cudaMemcpy(dst: *mut c_void, src: *const c_void, count: usize, kind: i32) -> i32;
+}
+const CUDA_MEMCPY_DEVICE_TO_HOST: i32 = 2;
 
 /// Quantum tensor wrapper implementing DLPack protocol
 ///
@@ -98,6 +106,100 @@ impl QuantumTensor {
         }
     }
 
+    /// Copy encoded quantum state from GPU to a NumPy array (CPU, float64).
+    ///
+    /// Performs a synchronous cudaMemcpy D2H without requiring PyTorch.
+    /// Complex128 output (imaginary parts are always 0.0 per the CUDA kernel)
+    /// is reduced to float64 by discarding the zero imaginary components.
+    /// The device buffer is copied as one dense block of rows * cols elements
+    /// (strides are not consulted). On success the DLPack deleter is invoked
+    /// and the internal pointer is cleared, so the tensor is single-use; if
+    /// validation or the copy fails the tensor is left unconsumed.
+    ///
+    /// Returns:
+    ///     numpy.ndarray of shape (batch_size, state_len), dtype float64.
+    ///     A 1-D tensor is returned as a single row, shape (1, len).
+    ///
+    /// Raises:
+    ///     RuntimeError: If the tensor has already been consumed, the pointer is
+    ///                   invalid, the shape or dtype is unsupported, a dimension
+    ///                   is negative, the size overflows, or the CUDA copy fails.
+    #[allow(clippy::wrong_self_convention)] // mut required: sets self.consumed and calls DLPack deleter
+    fn to_numpy<'py>(&mut self, py: Python<'py>) -> PyResult<Bound<'py, PyArray2<f64>>> {
+        if self.consumed {
+            return Err(PyRuntimeError::new_err(
+                "DLPack tensor already consumed (can only be used once)",
+            ));
+        }
+        if self.ptr.is_null() {
+            return Err(PyRuntimeError::new_err("Invalid DLPack tensor pointer"));
+        }
+
+        // SAFETY: self.ptr was checked to be non-null and the tensor has not been consumed,
+        // so the managed tensor has not been handed to a consumer or freed by this wrapper.
+        // The shape pointer is null-checked before `ndim` entries are read from it.
+        let (rows, cols, host_data) = unsafe {
+            let dl_tensor = &(*self.ptr).dl_tensor;
+
+            // Shape — require 1-D or 2-D.
+            let ndim = dl_tensor.ndim as usize;
+            if ndim == 0 || ndim > 2 || dl_tensor.shape.is_null() {
+                return Err(PyRuntimeError::new_err(
+                    "to_numpy() requires a 1-D or 2-D tensor",
+                ));
+            }
+            let shape = std::slice::from_raw_parts(dl_tensor.shape, ndim);
+            // Reject negative extents: `as usize` would wrap them into huge sizes.
+            let to_extent = |d: i64| {
+                usize::try_from(d).map_err(|_| {
+                    PyRuntimeError::new_err(format!("to_numpy() invalid dimension: {}", d))
+                })
+            };
+            let (rows, cols) = if ndim == 1 {
+                (1usize, to_extent(shape[0])?)
+            } else {
+                (to_extent(shape[0])?, to_extent(shape[1])?)
+            };
+
+            // Dtype: complex128 (DL_COMPLEX=5, bits=128) or float64 (DL_FLOAT=2, bits=64).
+            let dtype = &dl_tensor.dtype;
+            let (is_complex, elem_bytes) = match (dtype.code, dtype.bits) {
+                (5, 128) => (true, 16usize),
+                (2, 64) => (false, 8usize),
+                _ => {
+                    return Err(PyRuntimeError::new_err(format!(
+                        "to_numpy() unsupported dtype: code={}, bits={}",
+                        dtype.code, dtype.bits
+                    )));
+                }
+            };
+
+            // Checked arithmetic so an absurd shape cannot wrap the element or byte count.
+            let n_elems = rows
+                .checked_mul(cols)
+                .filter(|n| n.checked_mul(elem_bytes).is_some())
+                .ok_or_else(|| PyRuntimeError::new_err("to_numpy() tensor size overflows"))?;
+            // For complex128 each element is two consecutive f64 values.
+            let host_f64_count = if is_complex { n_elems * 2 } else { n_elems };
+            let mut host_buf = vec![0.0f64; host_f64_count];
+
+            let data_ptr = (dl_tensor.data as *const u8).add(dl_tensor.byte_offset as usize);
+
+            let ret = cudaMemcpy(
+                host_buf.as_mut_ptr() as *mut c_void,
+                data_ptr as *const c_void,
+                n_elems * elem_bytes,
+                CUDA_MEMCPY_DEVICE_TO_HOST,
+            );
+            if ret != 0 {
+                // Not marked consumed: Drop still owns the tensor and will release it.
+                return Err(PyRuntimeError::new_err(format!(
+                    "cudaMemcpy D2H failed with error code {}",
+                    ret
+                )));
+            }
+
+            // Consumed: GPU memory is ours to free now.
+            self.consumed = true;
+            if let Some(deleter) = (*self.ptr).deleter {
+                deleter(self.ptr);
+            }
+            // The deleter has run; clear the pointer so no later call can dereference it.
+            self.ptr = std::ptr::null_mut();
+
+            // complex128 → float64: discard imaginary parts (always 0.0).
+            let host_data: Vec<f64> = if is_complex {
+                host_buf.into_iter().step_by(2).collect()
+            } else {
+                host_buf
+            };
+
+            (rows, cols, host_data)
+        };
+
+        let arr = Array2::from_shape_vec((rows, cols), host_data)
+            .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
+        Ok(PyArray2::from_owned_array(py, arr))
+    }
+
     /// Returns DLPack device information
     ///
     /// Returns:
@@ -122,8 +224,8 @@ impl QuantumTensor {
 
 impl Drop for QuantumTensor {
     fn drop(&mut self) {
-        // Only free if not consumed by __dlpack__
-        // If consumed, PyTorch/consumer will call the deleter
+        // Only free if not consumed; __dlpack__ leaves freeing to PyTorch,
+        // to_numpy() calls the deleter itself after the D2H copy.
         if !self.consumed && !self.ptr.is_null() {
             unsafe {
                 // Defensive check: qdp-core always provides a deleter
diff --git a/qdp/qdp-python/tests/test_quantum_data_loader.py b/qdp/qdp-python/tests/test_quantum_data_loader.py
@@ -16,6 +16,9 @@
 
 """tests for Quantum Data Loader."""
 
+from unittest.mock import patch
+
+import numpy as np
 import pytest
 
 try:
@@ -28,6 +31,15 @@ def _loader_available():
     return QuantumDataLoader is not None
 
 
+def _cuda_available():
+    try:
+        import torch
+
+        return torch.cuda.is_available()
+    except ImportError:
+        return False
+
+
 @pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
 def test_mutual_exclusion_both_sources_raises() -> None:
     """Calling both .source_synthetic() and .source_file() then __iter__ raises ValueError."""
@@ -238,3 +250,134 @@ def test_source_file_s3_streaming_non_parquet_raises(path):
         )
     msg = str(exc_info.value).lower()
     assert "parquet" in msg or "streaming" in msg
+
+
+# --- as_torch() / as_numpy() output format tests ---
+
+
+@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
+def test_as_torch_raises_at_config_time_when_torch_missing():
+    """as_torch() raises RuntimeError immediately (config time) when torch is not installed."""
+    with patch("qumat_qdp.loader._torch", None):
+        loader = QuantumDataLoader(device_id=0).qubits(4).batches(2, size=4)
+        with pytest.raises(RuntimeError) as exc_info:
+            loader.as_torch()
+        msg = str(exc_info.value)
+        assert "PyTorch" in msg or "torch" in msg.lower()
+        assert "pip install" in msg
+
+
+@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
+def test_as_numpy_succeeds_at_config_time_without_torch():
+    """as_numpy() does not raise at config time even when torch is not installed."""
+    with patch("qumat_qdp.loader._torch", None):
+        loader = (
+            QuantumDataLoader(device_id=0)
+            .qubits(4)
+            .batches(2, size=4)
+            .source_synthetic()
+            .as_numpy()
+        )
+    assert loader._output_format == ("numpy",)
+
+
+@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
+@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required")
+def test_as_numpy_yields_float64_arrays():
+    """as_numpy() yields numpy float64 arrays with correct shape; no torch required."""
+    num_qubits = 4
+    batch_size = 8
+    state_len = 2**num_qubits  # 16
+
+    batches = []
+    with patch("qumat_qdp.loader._torch", None):
+        loader = (
+            QuantumDataLoader(device_id=0)
+            .qubits(num_qubits)
+            .batches(3, size=batch_size)
+            .source_synthetic()
+            .as_numpy()
+        )
+        for batch in loader:
+            batches.append(batch)
+
+    assert len(batches) == 3
+    for batch in batches:
+        assert isinstance(batch, np.ndarray), f"expected ndarray, got {type(batch)}"
+        assert batch.dtype == np.float64, f"expected float64, got {batch.dtype}"
+        assert batch.ndim == 2
+        assert batch.shape == (batch_size, state_len), f"unexpected shape {batch.shape}"
+
+
+@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
+@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required")
+def test_as_numpy_amplitudes_are_unit_norm():
+    """Each row from as_numpy() should be a unit-norm state vector (amplitude encoding)."""
+    num_qubits = 4
+    batch_size = 16
+
+    loader = (
+        QuantumDataLoader(device_id=0)
+        .qubits(num_qubits)
+        .batches(2, size=batch_size)
+        .source_synthetic()
+        .as_numpy()
+    )
+    for batch in loader:
+        arr = np.asarray(batch, dtype=np.float64)
+        norms = np.linalg.norm(arr, axis=1)
+        np.testing.assert_allclose(norms, 1.0, atol=1e-5)
+
+
+@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
+@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required")
+def test_as_torch_yields_cuda_tensors():
+    """as_torch(device='cuda') yields torch tensors on CUDA."""
+    try:
+        import torch
+    except ImportError:
+        pytest.skip("torch not installed")
+
+    num_qubits = 4
+    batch_size = 8
+    state_len = 2**num_qubits
+
+    loader = (
+        QuantumDataLoader(device_id=0)
+        .qubits(num_qubits)
+        .batches(2, size=batch_size)
+        .source_synthetic()
+        .as_torch(device="cuda")
+    )
+    for batch in loader:
+        assert isinstance(batch, torch.Tensor)
+        assert batch.is_cuda
+        assert batch.shape == (batch_size, state_len)
+
+
+@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
+@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required")
+def test_as_numpy_from_source_array():
+    """as_numpy() works with source_array(), yielding correct shapes and dtype."""
+    num_qubits = 3
+    state_len = 2**num_qubits  # 8
+    n_samples = 12
+    batch_size = 4
+
+    rng = np.random.default_rng(42)
+    X = rng.standard_normal((n_samples, state_len))
+
+    loader = (
+        QuantumDataLoader(device_id=0)
+        .qubits(num_qubits)
+        .batches(1, size=batch_size)
+        .encoding("amplitude")
+        .source_array(X)
+        .as_numpy()
+    )
+    batches = list(loader)
+    assert len(batches) == n_samples // batch_size
+    for batch in batches:
+        assert isinstance(batch, np.ndarray)
+        assert batch.dtype == np.float64
+        assert batch.shape[1] == state_len
diff --git a/qdp/qdp-python/uv.lock b/qdp/qdp-python/uv.lock
diff --git a/uv.lock b/uv.lock