From bc34dfe47e0fd7c85349d9d93b62c1ebc16f4ca2 Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Thu, 18 Jun 2026 15:44:39 +0000
Subject: [PATCH 1/2] Give workgroup barriers their memory-fence flags

`barrier(0)` lowers to an `OpControlBarrier` with `SequentiallyConsistent`
semantics but no storage-class bit, which the SPIR-V spec treats as
ordering no memory. As a result, shared-local (and global) writes made
before the barrier are not guaranteed to be visible to other work-items
after it, which can silently drop updates (e.g. a workgroup local-atomic
accumulation losing counts).

Pass the appropriate fence flags so the barrier actually orders memory:
`LOCAL_MEM_FENCE | GLOBAL_MEM_FENCE` for KA `@synchronize` (matching CUDA
`__syncthreads`), and `LOCAL_MEM_FENCE` for the shared-memory tree in the
mapreduce `reduce_group`.
---
 src/mapreduce.jl     | 6 +++++-
 src/oneAPIKernels.jl | 8 +++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 822b9b16..645db2cd 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -33,7 +33,11 @@
     # perform a reduction
     d = 1
     while d < items
-        barrier(0)
+        # `barrier(0)` lowers to an OpControlBarrier without the WorkgroupMemory
+        # storage-class bit, so it does not order the shared-local tree accesses
+        # across the barrier. Fence local memory so that each tree step sees the
+        # previous step's `shared[]` writes.
+        barrier(SPIRVIntrinsics.LOCAL_MEM_FENCE)
         index = 2 * d * (item-1) + 1
         @inbounds if index <= items
             other_val = if index + d <= items
diff --git a/src/oneAPIKernels.jl b/src/oneAPIKernels.jl
index 6e092397..bc6f3218 100644
--- a/src/oneAPIKernels.jl
+++ b/src/oneAPIKernels.jl
@@ -214,7 +214,13 @@ end
 ## Synchronization and Printing
 
 @device_override @inline function KA.__synchronize()
-    barrier(0)
+    # Fence both local and global memory across the workgroup barrier, matching CUDA
+    # `__syncthreads` semantics. `barrier(0)` lowers to `OpControlBarrier` with
+    # `SequentiallyConsistent` but WITHOUT any storage-class bit, which the SPIR-V spec
+    # treats as ordering *no* memory — so shared-local or global writes are not guaranteed
+    # visible to other work-items after the barrier. `LOCAL_MEM_FENCE | GLOBAL_MEM_FENCE`
+    # ORs in the WorkgroupMemory/CrossWorkgroupMemory fence bits.
+    barrier(SPIRVIntrinsics.LOCAL_MEM_FENCE | SPIRVIntrinsics.GLOBAL_MEM_FENCE)
 end
 
 @device_override @inline function KA.__print(args...)

From f572343bb6082bd998c614db426de3bac2419c62 Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Mon, 22 Jun 2026 10:03:43 -0500
Subject: [PATCH 2/2] Fix oneLocalArray docstring example to fence the barrier

The matmul tiling example called `barrier()`, which has no zero-argument
method (the only signature is `barrier(flags)`), and so modeled the
unfenced pattern that orders no memory. Use
`barrier(oneAPI.LOCAL_MEM_FENCE)` so the public example matches the fenced
barriers introduced in the previous commit and actually fences the
local-memory tile writes.
---
 src/device/array.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/device/array.jl b/src/device/array.jl
index 24822656..75ba7908 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -325,7 +325,7 @@ function matmul_kernel(A, B, C)
     tile_A[local_i, local_j] = A[...]
     tile_B[local_i, local_j] = B[...]
 
-    barrier()  # Synchronize workgroup
+    barrier(oneAPI.LOCAL_MEM_FENCE)  # synchronize the workgroup, fencing local memory
 
     # Compute using local memory
     # ...