Skip to content

Commit b9d46ca

Browse files
committed
Harden RAE DiT conversion and pipeline helpers
1 parent afc2db7 commit b9d46ca

4 files changed

Lines changed: 84 additions & 7 deletions

File tree

scripts/convert_rae_stage2_to_diffusers.py

Lines changed: 21 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -184,6 +184,14 @@ def build_scheduler_config(config: dict[str, Any]) -> tuple[FlowMatchEulerDiscre
184184
misc = _resolve_section(config, "misc")
185185

186186
transport_params = transport.get("params", {})
187+
path_type = str(transport_params.get("path_type", "Linear"))
188+
prediction = str(transport_params.get("prediction", "velocity"))
189+
if path_type.lower() != "linear" or prediction.lower() != "velocity":
190+
raise ValueError(
191+
"Only `transport.params.path_type=Linear` with `transport.params.prediction=velocity` is "
192+
"supported by this converter because it always saves a `FlowMatchEulerDiscreteScheduler`."
193+
)
194+
187195
latent_size = misc.get("latent_size", None)
188196
if latent_size is None:
189197
raise KeyError("Config must define `misc.latent_size` for scheduler conversion.")
@@ -200,8 +208,8 @@ def build_scheduler_config(config: dict[str, Any]) -> tuple[FlowMatchEulerDiscre
200208
metadata = {
201209
"num_train_timesteps": scheduler.config.num_train_timesteps,
202210
"shift": scheduler.config.shift,
203-
"path_type": transport_params.get("path_type", "Linear"),
204-
"prediction": transport_params.get("prediction", "velocity"),
211+
"path_type": path_type,
212+
"prediction": prediction,
205213
"time_dist_type": transport_params.get("time_dist_type", "uniform"),
206214
}
207215
return scheduler, metadata
@@ -307,20 +315,27 @@ def write_metadata(output_path: Path, metadata: dict[str, Any]) -> None:
307315

308316

309317
def resolve_input_path(accessor: RepoAccessor, path: str) -> Path:
318+
expanded_path = Path(path).expanduser()
319+
if expanded_path.is_absolute():
320+
if expanded_path.is_file():
321+
return expanded_path
322+
raise FileNotFoundError(f"Absolute path does not exist: {expanded_path}")
323+
310324
candidates = [path]
311325
if path.startswith("models/"):
312326
candidates.append(path[len("models/") :])
313327

314328
for candidate in candidates:
315-
local_path = Path(candidate)
316-
if local_path.is_file():
317-
return local_path
318-
319329
try:
320330
return accessor.fetch(candidate)
321331
except FileNotFoundError:
322332
continue
323333

334+
for candidate in candidates:
335+
local_path = Path(candidate).expanduser()
336+
if local_path.is_file():
337+
return local_path
338+
324339
raise FileNotFoundError(f"Could not resolve `{path}` from `{accessor.repo_or_path}`.")
325340

326341

src/diffusers/pipelines/rae_dit/pipeline_rae_dit.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -249,7 +249,7 @@ def __call__(
249249
if output_type == "latent":
250250
output = latents
251251
else:
252-
images = self.vae.decode(latents).sample.clamp(0, 1)
252+
images = self.vae.decode(latents.to(dtype=self.vae.dtype)).sample.clamp(0, 1)
253253
output = self.image_processor.postprocess(images, output_type=output_type)
254254

255255
self.maybe_free_model_hooks()

tests/others/test_rae_dit_conversion.py

Lines changed: 48 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -13,12 +13,18 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import os
1617
import tempfile
18+
from pathlib import Path
1719

20+
import pytest
1821
import torch
1922

2023
from diffusers import AutoencoderRAE
2124
from scripts.convert_rae_stage2_to_diffusers import (
25+
RepoAccessor,
26+
build_scheduler_config,
27+
resolve_input_path,
2228
translate_transformer_state_dict,
2329
unwrap_state_dict,
2430
)
@@ -78,6 +84,48 @@ def test_translate_transformer_state_dict_maps_gelu_keys():
7884
assert torch.equal(translated["blocks.0.mlp.net.2.weight"], fc2_weight)
7985

8086

87+
def test_build_scheduler_config_rejects_non_linear_or_non_velocity_transport():
88+
with pytest.raises(ValueError):
89+
build_scheduler_config(
90+
{
91+
"transport": {"params": {"path_type": "VP", "prediction": "velocity"}},
92+
"misc": {"latent_size": [768, 16, 16]},
93+
}
94+
)
95+
96+
with pytest.raises(ValueError):
97+
build_scheduler_config(
98+
{
99+
"transport": {"params": {"path_type": "Linear", "prediction": "epsilon"}},
100+
"misc": {"latent_size": [768, 16, 16]},
101+
}
102+
)
103+
104+
105+
def test_resolve_input_path_prefers_repo_accessor_for_relative_paths():
106+
original_cwd = Path.cwd()
107+
108+
with tempfile.TemporaryDirectory() as repo_tmpdir, tempfile.TemporaryDirectory() as cwd_tmpdir:
109+
repo_root = Path(repo_tmpdir)
110+
cwd_root = Path(cwd_tmpdir)
111+
112+
repo_config = repo_root / "configs" / "sample.yaml"
113+
repo_config.parent.mkdir(parents=True, exist_ok=True)
114+
repo_config.write_text("repo: true\n", encoding="utf-8")
115+
116+
cwd_config = cwd_root / "configs" / "sample.yaml"
117+
cwd_config.parent.mkdir(parents=True, exist_ok=True)
118+
cwd_config.write_text("cwd: true\n", encoding="utf-8")
119+
120+
os.chdir(cwd_root)
121+
try:
122+
resolved = resolve_input_path(RepoAccessor(str(repo_root)), "configs/sample.yaml")
123+
finally:
124+
os.chdir(original_cwd)
125+
126+
assert resolved == repo_config
127+
128+
81129
def test_autoencoder_rae_from_pretrained_loads_local_checkpoint():
82130
model = AutoencoderRAE(
83131
encoder_type="mae",

tests/pipelines/rae_dit/test_pipeline_rae_dit.py

Lines changed: 14 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -188,6 +188,20 @@ def test_inference(self):
188188
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
189189
self.assertLessEqual(max_diff, 1e-4)
190190

191+
def test_inference_casts_latents_to_vae_dtype_before_decode(self):
192+
components = self.get_dummy_components()
193+
components["vae"] = components["vae"].to(dtype=torch.float64)
194+
pipe = self.pipeline_class(**components).to("cpu")
195+
pipe.set_progress_bar_config(disable=None)
196+
197+
inputs = self.get_dummy_inputs("cpu")
198+
inputs["output_type"] = "pt"
199+
200+
images = pipe(**inputs).images
201+
202+
self.assertEqual(images.shape, (1, 3, 4, 4))
203+
self.assertTrue(torch.isfinite(images).all().item())
204+
191205
def test_inference_classifier_free_guidance(self):
192206
pipe = self.pipeline_class(**self.get_dummy_components()).to("cpu")
193207
pipe.set_progress_bar_config(disable=None)

0 commit comments

Comments (0)