Commit 6e456b0

ChenhanYu and claude committed

fix: PTQ 1GPU, export PP divisibility, hidden states conversations key

- megatron_lm_ptq.yaml: Qwen3-8B to single GPU for L40 clusters
- quantize.sh: auto-find largest PP dividing model num_hidden_layers for export (Qwen3-8B has 36 layers, not divisible by 8)
- compute_hidden_states_trtllm.py: use messages with conversations fallback (matching the HF version)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Chenhan Yu <chenhany@nvidia.com>

1 parent e9a4989 · commit 6e456b0

3 files changed: 23 additions & 12 deletions

examples/speculative_decoding/collect_hidden_states/compute_hidden_states_trtllm.py (1 addition, 1 deletion)

@@ -256,7 +256,7 @@ async def submit_generates():
     for entry in dataset:
         conversation_id = entry.get("conversation_id", entry.get("uuid"))

-        conversations = entry["conversations"]
+        conversations = entry.get("messages") or entry.get("conversations")
         if not conversations or not isinstance(conversations, list):
             num_invalid += 1
             continue
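The fallback above can be exercised in isolation. This is a minimal sketch (the `get_conversations` helper is hypothetical, not part of the diff; dataset entries are assumed to be dicts keyed by either `messages` or `conversations`):

```python
def get_conversations(entry: dict):
    # Prefer the HF-style "messages" key, then fall back to "conversations",
    # matching the changed line in compute_hidden_states_trtllm.py.
    conversations = entry.get("messages") or entry.get("conversations")
    # Mirror the diff's validity check: must be a non-empty list.
    if not conversations or not isinstance(conversations, list):
        return None
    return conversations


# Both entry shapes resolve through the same code path.
hf_entry = {"messages": [{"role": "user", "content": "hi"}]}
legacy_entry = {"conversations": [{"from": "human", "value": "hi"}]}
bad_entry = {"conversations": "not-a-list"}
```

Entries with neither key (or a non-list value) count toward `num_invalid` and are skipped, exactly as in the loop above.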

tools/launcher/common/megatron_lm/quantize/quantize.sh (14 additions, 3 deletions)

@@ -41,11 +41,22 @@ TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} ${QUANTIZE_EXE} ${MLM_MODEL_CF
 export MLM_EXTRA_ARGS="--mmlu-dataset ${MMLU_DATASET:-/hf-local/cais/mmlu} --fraction 0.01 --lower-bound ${MMLU_LOWER_BOUND:-0.38} --disable-tqdm"
 TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${MMLU_EXE} ${MLM_MODEL_CFG}

-# Export quantized checkpoint to HF format (PP=all GPUs)
+# Export quantized checkpoint to HF format
+# Use largest PP <= total GPUs that divides the model's num_hidden_layers
 TOTAL_GPUS=$(python3 -c "import torch; print(torch.cuda.device_count())" 2>/dev/null || echo ${NUM_GPUS:-1})
-echo "=== Exporting ${MLM_MODEL_CFG} ${QUANT_CFG} (PP=${TOTAL_GPUS}) ==="
+EXPORT_PP=$(python3 -c "
+import json, os
+cfg = os.path.join('${HF_MODEL_CKPT}', 'config.json')
+n_layers = json.load(open(cfg)).get('num_hidden_layers', 1) if os.path.exists(cfg) else 1
+gpus = ${TOTAL_GPUS}
+pp = gpus
+while pp > 1 and n_layers % pp != 0:
+    pp -= 1
+print(pp)
+" 2>/dev/null || echo ${TOTAL_GPUS})
+echo "=== Exporting ${MLM_MODEL_CFG} ${QUANT_CFG} (PP=${EXPORT_PP}, ${TOTAL_GPUS} GPUs) ==="
 export MLM_EXTRA_ARGS=
-TP=1 PP=${TOTAL_GPUS} EP=1 ETP=1 MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${EXPORT_EXE} ${MLM_MODEL_CFG}
+TP=1 PP=${EXPORT_PP} EP=1 ETP=1 MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${EXPORT_EXE} ${MLM_MODEL_CFG}
 ls ${EXPORT_DIR}
 cat ${EXPORT_DIR}/hf_quant_config.json
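The PP-selection loop embedded in quantize.sh can be checked on its own. A minimal sketch of the same logic (the `export_pp` function name is an illustration, not part of the script):

```python
def export_pp(n_layers: int, gpus: int) -> int:
    """Largest pipeline-parallel size <= gpus that evenly divides n_layers."""
    pp = gpus
    # Walk down from the GPU count until the layer count divides evenly;
    # pp=1 always divides, so the loop terminates there in the worst case.
    while pp > 1 and n_layers % pp != 0:
        pp -= 1
    return pp


# Qwen3-8B has 36 hidden layers: on an 8-GPU node, 36 % 8 != 0 and
# 36 % 7 != 0, so the export falls back to PP=6.
```

This is why the commit message notes that Qwen3-8B's 36 layers are "not divisible by 8": the old `PP=${TOTAL_GPUS}` export would fail on an 8-GPU node, while the new loop picks the largest workable value.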

tools/launcher/examples/Qwen/Qwen3-8B/megatron_lm_ptq.yaml (8 additions, 8 deletions)

@@ -24,7 +24,7 @@ pipeline:
     config:
       model: Qwen/Qwen3-8B
       quant_cfg: NVFP4_DEFAULT_CFG
-      tp: 8
+      tp: 1
       calib_dataset: abisee/cnn_dailymail
       calib_size: 32
       mmlu_dataset: cais/mmlu
@@ -33,15 +33,15 @@ pipeline:
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 1
-      ntasks_per_node: 8
-      gpus_per_node: 8
+      ntasks_per_node: 1
+      gpus_per_node: 1

   task_1:
     _target_: common.megatron_lm.quantize.task.MegatronLMQuantizeTask
     config:
       model: Qwen/Qwen3-8B
       quant_cfg: FP8_DEFAULT_CFG
-      tp: 8
+      tp: 1
       calib_dataset: abisee/cnn_dailymail
       calib_size: 32
       mmlu_dataset: cais/mmlu
@@ -50,18 +50,18 @@ pipeline:
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 1
-      ntasks_per_node: 8
-      gpus_per_node: 8
+      ntasks_per_node: 1
+      gpus_per_node: 1

   # Step 3: TRT-LLM eval MMLU on all exported checkpoints
   task_2:
     script: common/tensorrt_llm/eval.sh
     environment:
       - HF_MODEL_CKPT: /scratchspace/export
-      - TP: "8"
+      - TP: "1"
       - EP: "1"
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
-      gpus_per_node: 8
+      gpus_per_node: 1
