Skip to content

Commit b3dd131

Browse files
ZLkanyo009 authored and zovonoir committed
[BugFix] enable deepseek r1 fp4
1 parent 108a70e commit b3dd131

File tree

2 files changed

+10
-4
lines changed

2 files changed

+10
-4
lines changed

atom/plugin/sglang/attention_backend/radix_attention.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ def __init__(
8888
torch.tensor([1.0], dtype=torch.float32, device="cuda"),
8989
requires_grad=False,
9090
)
91+
if self.attn.k_scale_float is None:
92+
self.attn.k_scale_float = 1.0
9193
if self.attn.v_scale is None:
9294
self.attn.v_scale = torch.nn.Parameter(
9395
torch.tensor([1.0], dtype=torch.float32, device="cuda"),

atom/plugin/sglang/attention_backend/sgl_attention_mla.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -909,10 +909,14 @@ def _split_and_assign_kc_vc(
909909
w_vc = w_vc.contiguous()
910910
attn.w_vc = bind_or_assign(attn.w_vc, w_vc)
911911

912-
if hasattr(attn.kv_b_proj, "weight_scale") and attn.w_scale is None:
913-
attn.w_scale = bind_or_assign(attn.w_scale, attn.kv_b_proj.weight_scale)
914-
if _is_hip:
915-
attn.w_scale *= 2.0
912+
kv_weight_scale = getattr(attn.kv_b_proj, "weight_scale", None)
913+
if (
914+
kv_weight_scale is not None
915+
and attn.w_scale is None
916+
and w.dtype in (torch.float8_e4m3fn, torch.float8_e4m3fnuz)
917+
):
918+
scale = kv_weight_scale * 2.0 if _is_hip else kv_weight_scale
919+
attn.w_scale = bind_or_assign(attn.w_scale, scale)
916920

917921
if _is_cpu and _is_cpu_amx_available and w.dtype == torch.float8_e4m3fn:
918922
attn.w_kc = attn.w_kc.to(torch.bfloat16) * attn.w_scale

0 commit comments

Comments (0)