Finish resume support across Modal fit paths

MaxGhenis · MaxGhenis · commit 6acc3eb241d3 · 2026-04-09T11:47:36.000-04:00
diff --git a/modal_app/remote_calibration_runner.py b/modal_app/remote_calibration_runner.py
@@ -166,6 +166,7 @@ def _trigger_repository_dispatch(event_type: str = "calibration-updated"):
 def _fit_weights_impl(
     branch: str,
     epochs: int,
+    output_prefix: str = "",
     target_config: str = None,
     beta: float = None,
     lambda_l0: float = None,
@@ -183,6 +184,7 @@ def _fit_weights_impl(
     artifacts = artifacts_dir if artifacts_dir else f"{PIPELINE_MOUNT}/artifacts"
     db_path = f"{artifacts}/policy_data.db"
     dataset_path = f"{artifacts}/source_imputed_stratified_extended_cps.h5"
+    checkpoint_path = f"{artifacts}/{output_prefix}calibration_checkpoint.pt"
     for label, p in [("database", db_path), ("dataset", dataset_path)]:
         if not os.path.exists(p):
             raise RuntimeError(
@@ -203,7 +205,11 @@ def _fit_weights_impl(
         db_path,
         "--dataset",
         dataset_path,
+        "--checkpoint-output",
+        checkpoint_path,
     ]
+    if os.path.exists(checkpoint_path):
+        cmd.extend(["--resume-from", checkpoint_path])
     if target_config:
         cmd.extend(["--target-config", target_config])
     if not skip_county:
@@ -212,11 +218,15 @@ def _fit_weights_impl(
         cmd.extend(["--workers", str(workers)])
     _append_hyperparams(cmd, beta, lambda_l0, lambda_l2, learning_rate, log_freq)
 
-    cal_rc, cal_lines = _run_streaming(
-        cmd,
-        env=os.environ.copy(),
-        label="calibrate",
-    )
+    try:
+        cal_rc, cal_lines = _run_streaming(
+            cmd,
+            env=os.environ.copy(),
+            label="calibrate",
+        )
+    finally:
+        if os.path.exists(checkpoint_path):
+            pipeline_vol.commit()
     if cal_rc != 0:
         raise RuntimeError(f"Script failed with code {cal_rc}")
 
@@ -277,15 +287,17 @@ def _fit_from_package_impl(
 
     print(f"Running command: {' '.join(cmd)}", flush=True)
 
-    cal_rc, cal_lines = _run_streaming(
-        cmd,
-        env=os.environ.copy(),
-        label="calibrate",
-    )
+    try:
+        cal_rc, cal_lines = _run_streaming(
+            cmd,
+            env=os.environ.copy(),
+            label="calibrate",
+        )
+    finally:
+        if os.path.exists(checkpoint_path):
+            pipeline_vol.commit()
     if cal_rc != 0:
         raise RuntimeError(f"Script failed with code {cal_rc}")
-
-    pipeline_vol.commit()
     return _collect_outputs(cal_lines)
 
 
@@ -511,6 +523,7 @@ def check_volume_package(artifacts_dir: str = "") -> dict:
 def fit_weights_t4(
     branch: str = "main",
     epochs: int = 200,
+    output_prefix: str = "",
     target_config: str = None,
     beta: float = None,
     lambda_l0: float = None,
@@ -522,14 +535,15 @@ def fit_weights_t4(
     artifacts_dir: str = "",
 ) -> dict:
     return _fit_weights_impl(
-        branch,
-        epochs,
-        target_config,
-        beta,
-        lambda_l0,
-        lambda_l2,
-        learning_rate,
-        log_freq,
+        branch=branch,
+        epochs=epochs,
+        output_prefix=output_prefix,
+        target_config=target_config,
+        beta=beta,
+        lambda_l0=lambda_l0,
+        lambda_l2=lambda_l2,
+        learning_rate=learning_rate,
+        log_freq=log_freq,
         skip_county=skip_county,
         workers=workers,
         artifacts_dir=artifacts_dir,
@@ -548,6 +562,7 @@ def fit_weights_t4(
 def fit_weights_a10(
     branch: str = "main",
     epochs: int = 200,
+    output_prefix: str = "",
     target_config: str = None,
     beta: float = None,
     lambda_l0: float = None,
@@ -559,14 +574,15 @@ def fit_weights_a10(
     artifacts_dir: str = "",
 ) -> dict:
     return _fit_weights_impl(
-        branch,
-        epochs,
-        target_config,
-        beta,
-        lambda_l0,
-        lambda_l2,
-        learning_rate,
-        log_freq,
+        branch=branch,
+        epochs=epochs,
+        output_prefix=output_prefix,
+        target_config=target_config,
+        beta=beta,
+        lambda_l0=lambda_l0,
+        lambda_l2=lambda_l2,
+        learning_rate=learning_rate,
+        log_freq=log_freq,
         skip_county=skip_county,
         workers=workers,
         artifacts_dir=artifacts_dir,
@@ -585,6 +601,7 @@ def fit_weights_a10(
 def fit_weights_a100_40(
     branch: str = "main",
     epochs: int = 200,
+    output_prefix: str = "",
     target_config: str = None,
     beta: float = None,
     lambda_l0: float = None,
@@ -596,14 +613,15 @@ def fit_weights_a100_40(
     artifacts_dir: str = "",
 ) -> dict:
     return _fit_weights_impl(
-        branch,
-        epochs,
-        target_config,
-        beta,
-        lambda_l0,
-        lambda_l2,
-        learning_rate,
-        log_freq,
+        branch=branch,
+        epochs=epochs,
+        output_prefix=output_prefix,
+        target_config=target_config,
+        beta=beta,
+        lambda_l0=lambda_l0,
+        lambda_l2=lambda_l2,
+        learning_rate=learning_rate,
+        log_freq=log_freq,
         skip_county=skip_county,
         workers=workers,
         artifacts_dir=artifacts_dir,
@@ -622,6 +640,7 @@ def fit_weights_a100_40(
 def fit_weights_a100_80(
     branch: str = "main",
     epochs: int = 200,
+    output_prefix: str = "",
     target_config: str = None,
     beta: float = None,
     lambda_l0: float = None,
@@ -633,14 +652,15 @@ def fit_weights_a100_80(
     artifacts_dir: str = "",
 ) -> dict:
     return _fit_weights_impl(
-        branch,
-        epochs,
-        target_config,
-        beta,
-        lambda_l0,
-        lambda_l2,
-        learning_rate,
-        log_freq,
+        branch=branch,
+        epochs=epochs,
+        output_prefix=output_prefix,
+        target_config=target_config,
+        beta=beta,
+        lambda_l0=lambda_l0,
+        lambda_l2=lambda_l2,
+        learning_rate=learning_rate,
+        log_freq=log_freq,
         skip_county=skip_county,
         workers=workers,
         artifacts_dir=artifacts_dir,
@@ -659,6 +679,7 @@ def fit_weights_a100_80(
 def fit_weights_h100(
     branch: str = "main",
     epochs: int = 200,
+    output_prefix: str = "",
     target_config: str = None,
     beta: float = None,
     lambda_l0: float = None,
@@ -670,14 +691,15 @@ def fit_weights_h100(
     artifacts_dir: str = "",
 ) -> dict:
     return _fit_weights_impl(
-        branch,
-        epochs,
-        target_config,
-        beta,
-        lambda_l0,
-        lambda_l2,
-        learning_rate,
-        log_freq,
+        branch=branch,
+        epochs=epochs,
+        output_prefix=output_prefix,
+        target_config=target_config,
+        beta=beta,
+        lambda_l0=lambda_l0,
+        lambda_l2=lambda_l2,
+        learning_rate=learning_rate,
+        log_freq=log_freq,
         skip_county=skip_county,
         workers=workers,
         artifacts_dir=artifacts_dir,
diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py
@@ -1149,9 +1149,7 @@ def run_calibration(
             "block_geoid": package.get("block_geoid"),
             "base_n_records": package_base_n_records,
             "n_clones": (
-                int(package_n_clones)
-                if package_n_clones is not None
-                else n_clones
+                int(package_n_clones) if package_n_clones is not None else n_clones
             ),
         }
         return (
diff --git a/tests/unit/test_remote_calibration_runner.py b/tests/unit/test_remote_calibration_runner.py