Commit dc20d8f

arcticflyclaude and Claude committed
Fix LocalBackend fork_checkpoint to overwrite initial LoRA for vLLM
When forking a checkpoint, the source checkpoint was copied to checkpoints/{source_step} in the destination model directory. However, model.register(backend) already created an empty LoRA at checkpoints/0000. When vLLM starts, it loads @0, which resolves to the empty 0000 checkpoint rather than the forked one.

Fix by also copying the forked weights to checkpoints/0000 so that vLLM loads the correct weights on startup.

Fixes #651

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4cbfa15 commit dc20d8f
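To make the failure mode concrete, here is a minimal, self-contained sketch of the layout described in the commit message. It is not the ART codebase itself: the checkpoints/NNNN naming and the local get_step_checkpoint_dir helper are assumptions modeled on the diff below, and the file contents are stand-ins for real LoRA weights.

```python
import os
import shutil
import tempfile


def get_step_checkpoint_dir(model_dir: str, step: int) -> str:
    # Assumed layout: {model_dir}/checkpoints/0000, 0001, ... (per the diff)
    return os.path.join(model_dir, "checkpoints", f"{step:04d}")


root = tempfile.mkdtemp()
source_model_dir = os.path.join(root, "source-model")
dest_model_dir = os.path.join(root, "forked-model")

# The source model has trained weights at, say, step 5.
source_step = 5
source_checkpoint_dir = get_step_checkpoint_dir(source_model_dir, source_step)
os.makedirs(source_checkpoint_dir)
with open(os.path.join(source_checkpoint_dir, "adapter_model.bin"), "w") as f:
    f.write("trained weights")  # stand-in for real LoRA tensors

# model.register(backend) has already created an empty LoRA at step 0.
os.makedirs(get_step_checkpoint_dir(dest_model_dir, 0))

# The fork copies the source checkpoint to checkpoints/{source_step} only.
dest_checkpoint_dir = get_step_checkpoint_dir(dest_model_dir, source_step)
shutil.copytree(source_checkpoint_dir, dest_checkpoint_dir)

# vLLM loads @0, i.e. checkpoints/0000 -- still empty at this point.
step0_dir = get_step_checkpoint_dir(dest_model_dir, 0)
print("step 0 before fix:", os.listdir(step0_dir))  # []

# The fix: overwrite step 0 with the forked weights as well.
if os.path.exists(step0_dir) and step0_dir != dest_checkpoint_dir:
    shutil.rmtree(step0_dir)
    shutil.copytree(dest_checkpoint_dir, step0_dir)
print("step 0 after fix:", os.listdir(step0_dir))  # ['adapter_model.bin']
```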

1 file changed: src/art/local/backend.py (9 additions, 0 deletions)
@@ -1434,6 +1434,15 @@ async def _experimental_fork_checkpoint(

         shutil.copytree(source_checkpoint_dir, dest_checkpoint_dir)

+        # Also overwrite the initial empty checkpoint at step 0 so vLLM
+        # loads the forked weights on startup (it uses @0 by default)
+        step0_dir = get_step_checkpoint_dir(dest_model_dir, 0)
+        if os.path.exists(step0_dir) and step0_dir != dest_checkpoint_dir:
+            if verbose:
+                print(f"Overwriting initial checkpoint at {step0_dir} with forked weights")
+            shutil.rmtree(step0_dir)
+            shutil.copytree(dest_checkpoint_dir, step0_dir)
+
         if verbose:
             print(
                 f"Successfully forked checkpoint from {from_model} (step {selected_step}) to {model.name}"
