From a8b77e7bb32657e134517431aec5aa5eaf9c2eb6 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 27 May 2026 13:07:23 -0700 Subject: [PATCH 1/2] polygonize: auto-detect attrs['transform'] for output georeferencing (#2536) `polygonize(raster, return_type='geopandas')` already auto-detects `attrs['crs']` and stamps it on the returned `GeoDataFrame`. It never read `attrs['transform']`, so the geometries stayed in raw pixel space even when the raster carried a valid affine transform. The result was a `GeoDataFrame` whose `.crs` claimed projected space while the geometries were not in that space -- silent misalignment by the raster's origin offset, worse than the missing-CRS bug (#2149) because downstream callers cannot detect the inconsistency from the `GeoDataFrame` alone. Add `_detect_raster_transform()` parallel to `_detect_raster_crs()` with resolution order: 1. `raster.attrs['transform']` (xrspatial.geotiff convention, a rasterio-ordered 6-tuple) 2. `raster.rio.transform()` (rioxarray, if installed) 3. `None` Skip auto-detection when `attrs['_xrspatial_no_georef']=True` (the xrspatial.geotiff no-georef marker) and when `rio.transform()` returns the identity (rioxarray's default when no transform info is available -- otherwise the output would silently shift by zero and look identical to "no transform"). Malformed `attrs['transform']` (wrong length, non-numeric) is silently ignored, matching the existing unparseable-CRS-attr policy. An explicit `transform=` argument still wins over the attr. The geometries themselves are transformed inside `_scan`, so this applies to all return types -- the "numpy", "awkward", "spatialpandas", and "geojson" outputs all benefit. The `simplify_tolerance` path is unaffected (simplification runs after transform application). --- xrspatial/polygonize.py | 80 ++++++++++++- xrspatial/tests/test_polygonize.py | 178 +++++++++++++++++++++++++++++ 2 files changed, 256 insertions(+), 2 deletions(-) diff --git a/xrspatial/polygonize.py b/xrspatial/polygonize.py index 32de5186b..cb7914493 100644 --- a/xrspatial/polygonize.py +++ b/xrspatial/polygonize.py @@ -599,6 +599,62 @@ def _detect_raster_crs(raster: xr.DataArray): return None +def _detect_raster_transform(raster: xr.DataArray): + """Detect the affine transform of an input raster. + + Returns a rasterio-ordered 6-tuple ``(pixel_width, 0.0, origin_x, + 0.0, pixel_height, origin_y)`` -- the same layout that + ``_transform_points`` consumes -- or ``None`` if no transform is + available. + + Resolution order, parallel to ``_detect_raster_crs``: + + 1. ``raster.attrs['transform']`` (xrspatial.geotiff convention, an + already rasterio-ordered 6-tuple). + 2. ``raster.rio.transform()`` (rioxarray, if installed). Returns an + ``affine.Affine``; iterating it yields the rasterio-ordered 6 + coefficients ``(a, b, c, d, e, f)``. + 3. ``None``. + + A raster carrying ``attrs['_xrspatial_no_georef']=True`` (the + xrspatial.geotiff "no georeference" marker) is treated as having + no transform even if ``rio.transform()`` is present, because that + marker explicitly opts out of georeferencing. + """ + # Honour the xrspatial.geotiff "no georeference" marker if set. + if raster.attrs.get('_xrspatial_no_georef'): + return None + + transform_attr = raster.attrs.get('transform') + if transform_attr is not None: + try: + t = tuple(float(v) for v in transform_attr) + except (TypeError, ValueError): + t = None + if t is not None and len(t) == 6: + return t + + try: + rio_transform = raster.rio.transform() + if rio_transform is not None: + # rasterio's Affine iterates as (a, b, c, d, e, f); the + # first 6 coefficients are the rasterio-ordered tuple. The + # 7th-9th elements are the (0, 0, 1) projective row, which + # _transform_points does not need. + t = tuple(float(v) for v in tuple(rio_transform)[:6]) + # rioxarray returns the identity affine when no transform + # is available, which would silently shift outputs by (0,0) + # and look identical to "no transform". Treat the identity + # the same as None so callers get an unambiguous answer. + if t == (1.0, 0.0, 0.0, 0.0, 1.0, 0.0): + return None + return t + except Exception: + pass + + return None + + def _to_geopandas( column: List[Union[int, float]], polygon_points: List[np.ndarray], @@ -1890,6 +1946,18 @@ def polygonize( value is dropped rather than raised. The ``spatialpandas`` and ``geojson`` return types do not carry CRS metadata: spatialpandas has no CRS slot, and GeoJSON (RFC 7946) is WGS84 only. + + When ``transform`` is not supplied explicitly, the raster's affine + transform is auto-detected in this order: ``raster.attrs['transform']`` + (xrspatial.geotiff convention, a rasterio-ordered 6-tuple), then + ``raster.rio.transform()`` (if rioxarray is installed). An explicit + ``transform=`` argument always overrides the auto-detected value. + Auto-detection is skipped when the raster carries + ``attrs['_xrspatial_no_georef']=True``. This applies to all return + types -- the geometries themselves are transformed, so the + coordinates emitted in the "numpy", "awkward", "spatialpandas" and + "geojson" outputs are also in CRS coordinate space, not pixel + space, when the raster carries a transform. """ _validate_raster(raster, func_name='polygonize', name='raster', ndim=2) if raster.shape[0] < 1 or raster.shape[1] < 1: @@ -1917,9 +1985,17 @@ def polygonize( f"connectivity must be either 4 or 8, not {connectivity}") connectivity_8 = (connectivity == 8) - # Check transform. + # Check transform. When the caller did not pass an explicit + # transform, fall back to one carried on the raster (attrs[ + # 'transform'] or rio.transform()). This keeps the + # ``return_type='geopandas'`` output consistent with the + # auto-detected CRS: pre-#2536, the GeoDataFrame got the raster's + # CRS but stayed in pixel space, so the metadata lied about the + # data. See _detect_raster_transform() for the resolution order. + if transform is None: + transform = _detect_raster_transform(raster) if transform is not None: - transform = np.asarray(transform) + transform = np.asarray(transform, dtype=np.float64) if len(transform) != 6: raise ValueError( f"Incorrect transform length of {len(transform)} instead of 6") diff --git a/xrspatial/tests/test_polygonize.py b/xrspatial/tests/test_polygonize.py index 859d50cd5..c54e10004 100644 --- a/xrspatial/tests/test_polygonize.py +++ b/xrspatial/tests/test_polygonize.py @@ -1364,6 +1364,184 @@ def test_no_crs_attr_simplify_still_works(self): assert df.crs.to_epsg() == 4326 +# --- transform attr propagation tests (issue #2536) --- +# +# polygonize used to propagate attrs['crs'] but ignore attrs['transform']. +# The result was a GeoDataFrame whose .crs claimed projected space while +# the geometries were in pixel space (silent misalignment). The fix auto- +# detects attrs['transform'] (or rio.transform()) when no explicit +# transform= is passed, mirroring the CRS auto-detection. + +@pytest.mark.skipif(gpd is None, reason="geopandas not installed") +class TestPolygonizeTransformPropagation: + """polygonize auto-detects attrs['transform'] when not given one (#2536).""" + + # rasterio-ordered transform: 10 m pixels, origin at (1e6, 5e6). + # _transform_points consumes this as + # x = a*col + b*row + c, y = d*col + e*row + f. + _TRANSFORM = (10.0, 0.0, 1_000_000.0, 0.0, -10.0, 5_000_000.0) + + @staticmethod + def _raster(**attrs): + data = np.array([[1, 1, 2], [1, 1, 2], [2, 2, 2]], dtype=np.int32) + return xr.DataArray( + data, + dims=('y', 'x'), + coords={'y': np.arange(3), 'x': np.arange(3)}, + attrs=attrs, + ) + + def test_transform_attr_auto_detected_numpy(self): + """attrs['transform'] is applied even when transform= is omitted.""" + raster = self._raster(transform=self._TRANSFORM) + _, polys = polygonize(raster, return_type='numpy') + # Polygons should span the CRS origin offset, not pixel space. + all_xs = np.concatenate([p[0][:, 0] for p in polys]) + all_ys = np.concatenate([p[0][:, 1] for p in polys]) + assert all_xs.min() >= 1_000_000.0 - 1e-6 + assert all_xs.max() <= 1_000_000.0 + 30.0 + 1e-6 + # pixel_height is negative so y decreases from origin. + assert all_ys.max() <= 5_000_000.0 + 1e-6 + assert all_ys.min() >= 5_000_000.0 - 30.0 - 1e-6 + + def test_transform_attr_auto_detected_geopandas(self): + """The geopandas path applies the auto-detected transform.""" + raster = self._raster(crs='EPSG:3857', transform=self._TRANSFORM) + df = polygonize(raster, return_type='geopandas') + assert df.crs is not None and df.crs.to_epsg() == 3857 + bounds = df.geometry.total_bounds # [minx, miny, maxx, maxy] + assert bounds[0] >= 1_000_000.0 - 1e-6 + assert bounds[2] <= 1_000_000.0 + 30.0 + 1e-6 + assert bounds[1] >= 5_000_000.0 - 30.0 - 1e-6 + assert bounds[3] <= 5_000_000.0 + 1e-6 + + def test_explicit_transform_overrides_attr(self): + """An explicit transform= wins over attrs['transform'].""" + raster = self._raster(transform=self._TRANSFORM) + # Identity-ish: pixel space, no offset. Output must be in 0..3. + identity = np.array([1.0, 0.0, 0.0, 0.0, 1.0, 0.0]) + _, polys = polygonize(raster, transform=identity, + return_type='numpy') + all_xs = np.concatenate([p[0][:, 0] for p in polys]) + all_ys = np.concatenate([p[0][:, 1] for p in polys]) + assert all_xs.max() <= 3.0 + 1e-6 + assert all_ys.max() <= 3.0 + 1e-6 + + def test_no_transform_info_yields_pixel_coords(self): + """A raster with no transform info keeps pixel-space coords.""" + raster = self._raster() # no attrs + _, polys = polygonize(raster, return_type='numpy') + all_xs = np.concatenate([p[0][:, 0] for p in polys]) + all_ys = np.concatenate([p[0][:, 1] for p in polys]) + assert all_xs.max() <= 3.0 + 1e-6 + assert all_ys.max() <= 3.0 + 1e-6 + + def test_no_georef_marker_suppresses_auto_detect(self): + """attrs['_xrspatial_no_georef']=True opts out of auto-detect even + when attrs['transform'] is also set (defensive: should not happen + in practice, but a malformed dict must not silently mis-transform).""" + raster = self._raster( + transform=self._TRANSFORM, _xrspatial_no_georef=True) + _, polys = polygonize(raster, return_type='numpy') + all_xs = np.concatenate([p[0][:, 0] for p in polys]) + assert all_xs.max() <= 3.0 + 1e-6 + + def test_transform_attr_list_form_auto_detected(self): + """attrs['transform'] stored as a list (not a tuple) still works.""" + raster = self._raster(transform=list(self._TRANSFORM)) + _, polys = polygonize(raster, return_type='numpy') + all_xs = np.concatenate([p[0][:, 0] for p in polys]) + assert all_xs.min() >= 1_000_000.0 - 1e-6 + + def test_transform_attr_invalid_length_falls_back(self): + """A malformed (wrong-length) attrs['transform'] is silently + ignored, parallel to how an unparseable attrs['crs'] is dropped + rather than raised. The user did not opt in to that transform + explicitly, so falling back to pixel coords is friendlier than + breaking the call.""" + raster = self._raster(transform=(1.0, 0.0, 0.0)) # only 3 elements + _, polys = polygonize(raster, return_type='numpy') + # Falls back to pixel-space coords. + all_xs = np.concatenate([p[0][:, 0] for p in polys]) + assert all_xs.max() <= 3.0 + 1e-6 + + def test_explicit_transform_invalid_length_still_raises(self): + """An EXPLICIT transform= of wrong length must still raise -- + only the attrs path is silent on malformed input.""" + raster = self._raster() + with pytest.raises(ValueError, match="Incorrect transform length"): + polygonize(raster, transform=(1.0, 0.0, 0.0), return_type='numpy') + + def test_transform_attr_unparseable_falls_back(self): + """A non-numeric attrs['transform'] is ignored, not raised.""" + raster = self._raster(transform=("not", "a", "transform")) + # Falls back to pixel coords; no exception. + _, polys = polygonize(raster, return_type='numpy') + all_xs = np.concatenate([p[0][:, 0] for p in polys]) + assert all_xs.max() <= 3.0 + 1e-6 + + def test_simplify_works_with_auto_detected_transform(self): + """simplify_tolerance still works after auto-detected transform.""" + # Larger raster to leave room for simplification. + data = np.zeros((10, 10), dtype=np.int32) + data[2:8, 2:8] = 1 + raster = xr.DataArray( + data, + dims=('y', 'x'), + attrs={'crs': 'EPSG:3857', 'transform': self._TRANSFORM}, + ) + df = polygonize( + raster, return_type='geopandas', simplify_tolerance=1.0) + assert df.crs.to_epsg() == 3857 + # Should land near the projected origin. + assert df.geometry.total_bounds[0] >= 1_000_000.0 - 1e-6 + + @pytest.mark.skipif(da is None, reason="dask not installed") + def test_transform_attr_auto_detected_dask(self): + """Dask backend honours auto-detected attrs['transform'].""" + data = np.array([[1, 1, 2], [1, 1, 2], [2, 2, 2]], dtype=np.int32) + dask_data = da.from_array(data, chunks=(2, 2)) + raster = xr.DataArray( + dask_data, + dims=('y', 'x'), + coords={'y': np.arange(3), 'x': np.arange(3)}, + attrs={'transform': self._TRANSFORM}, + ) + _, polys = polygonize(raster, return_type='numpy') + all_xs = np.concatenate([p[0][:, 0] for p in polys]) + all_ys = np.concatenate([p[0][:, 1] for p in polys]) + assert all_xs.min() >= 1_000_000.0 - 1e-6 + assert all_ys.max() <= 5_000_000.0 + 1e-6 + + def test_rio_transform_auto_detected(self): + """rio.transform() is used when attrs['transform'] is absent. + + rioxarray derives ``rio.transform()`` from the DataArray's x/y + coords when they are present, so the test sets coords that + encode the desired transform rather than calling + ``rio.write_transform`` (whose stamped value is overridden by + any existing coords). + """ + pytest.importorskip("rioxarray") + # 10 m pixels, origin at (1e6, 5e6), pixel_height negative. The + # coord values are pixel-centre coordinates, so the implied + # upper-left corner is (1e6, 5e6). + data = np.array([[1, 1, 2], [1, 1, 2], [2, 2, 2]], dtype=np.int32) + xs = 1_000_000.0 + 10.0 * np.arange(3) + 5.0 # centres + ys = 5_000_000.0 - 10.0 * np.arange(3) - 5.0 + raster = xr.DataArray( + data, + dims=('y', 'x'), + coords={'y': ys, 'x': xs}, + ) + # Sanity: rio derives the right transform from the coords. + derived = tuple(raster.rio.transform())[:6] + assert derived == self._TRANSFORM, derived + _, polys = polygonize(raster, return_type='numpy') + all_xs = np.concatenate([p[0][:, 0] for p in polys]) + assert all_xs.min() >= 1_000_000.0 - 1e-6 + + # --- atol / rtol tolerance parameter tests (issue #2173) --- # # The float pathway in _calculate_regions groups adjacent pixels using an From a2835e92448f34d51f03acd119ccbb6f373e6685 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 27 May 2026 13:11:46 -0700 Subject: [PATCH 2/2] Address review nits: CHANGELOG, mask/connectivity coverage (#2536) - CHANGELOG.md: add an Unreleased "Fixed" entry calling out the behavioural change (auto-detected transform now applies to callers who previously got pixel coords from a georef'd raster), with the opt-out recipe. - test_polygonize.py: add `test_transform_attr_with_mask` to verify the auto-detected transform composes with the `mask=` argument. - test_polygonize.py: parametrize `test_transform_attr_works_with_ both_connectivities` over `connectivity in (4, 8)` so a future change that decouples `_scan`'s transform call site from one of the connectivities cannot regress silently. - test_polygonize.py: add a comment on `test_no_georef_marker_suppresses_auto_detect` explaining that the state it exercises is not produced by xrspatial.geotiff in practice (mutually exclusive attrs), so the test guards against malformed third-party dicts rather than a known read path. The identity-affine short-circuit nit was left as-is per the review write-up: applying identity is a no-op, the existing comment already documents the rationale. --- CHANGELOG.md | 14 ++++++++ xrspatial/tests/test_polygonize.py | 51 +++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb01efd39..6b341eacb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,20 @@ ### Unreleased +#### Fixed + +- `polygonize(return_type='geopandas')` now auto-detects the raster's + affine transform from `attrs['transform']` (xrspatial.geotiff + convention) or `rio.transform()` (rioxarray) when the caller did + not pass an explicit `transform=` argument. Previously the function + auto-detected `attrs['crs']` and stamped it on the `GeoDataFrame` + while leaving the geometries in pixel space, so the CRS attribute + claimed projected space but the coordinates did not match. Callers + who relied on the pre-fix behaviour and expected pixel-space + geometries from a rasterio-loaded raster should pass `transform= + np.array([1, 0, 0, 0, 1, 0])` explicitly, or strip the transform + attr before calling polygonize. (#2536) + ### Version 0.10.0 - 2026-05-27 diff --git a/xrspatial/tests/test_polygonize.py b/xrspatial/tests/test_polygonize.py index c54e10004..0932b05fd 100644 --- a/xrspatial/tests/test_polygonize.py +++ b/xrspatial/tests/test_polygonize.py @@ -1439,7 +1439,14 @@ def test_no_transform_info_yields_pixel_coords(self): def test_no_georef_marker_suppresses_auto_detect(self): """attrs['_xrspatial_no_georef']=True opts out of auto-detect even when attrs['transform'] is also set (defensive: should not happen - in practice, but a malformed dict must not silently mis-transform).""" + in practice, but a malformed dict must not silently mis-transform). + + Note: ``xrspatial.geotiff`` never produces a raster with both attrs + set simultaneously -- see ``xrspatial/geotiff/_attrs.py:782-785``, + which only writes ``_xrspatial_no_georef=True`` when + ``has_georef=False`` and in that branch does NOT write + ``attrs['transform']``. This test guards against a hand-built or + future-third-party attrs dict that violates that invariant.""" raster = self._raster( transform=self._TRANSFORM, _xrspatial_no_georef=True) _, polys = polygonize(raster, return_type='numpy') @@ -1513,6 +1520,48 @@ def test_transform_attr_auto_detected_dask(self): assert all_xs.min() >= 1_000_000.0 - 1e-6 assert all_ys.max() <= 5_000_000.0 + 1e-6 + @pytest.mark.parametrize("connectivity", [4, 8]) + def test_transform_attr_works_with_both_connectivities(self, connectivity): + """Auto-detected transform applies for connectivity=4 and 8 alike. + + The transform is applied inside ``_scan`` after region tracing, so + the connectivity choice should not interact with the transform + path. Parametrize to catch any future change that decouples one + connectivity from the transform call site. + """ + raster = self._raster(transform=self._TRANSFORM) + _, polys = polygonize( + raster, connectivity=connectivity, return_type='numpy') + all_xs = np.concatenate([p[0][:, 0] for p in polys]) + all_ys = np.concatenate([p[0][:, 1] for p in polys]) + assert all_xs.min() >= 1_000_000.0 - 1e-6 + assert all_xs.max() <= 1_000_000.0 + 30.0 + 1e-6 + assert all_ys.max() <= 5_000_000.0 + 1e-6 + assert all_ys.min() >= 5_000_000.0 - 30.0 - 1e-6 + + def test_transform_attr_with_mask(self): + """Auto-detected transform composes with a mask= argument. + + The mask filters which pixels participate in polygonization but + does not touch the transform path; the surviving polygons should + still emerge in CRS space. + """ + raster = self._raster(transform=self._TRANSFORM) + # Mask out the right column (col index 2) so only the value-1 + # region in cols 0-1 remains. + mask_data = np.array( + [[True, True, False], + [True, True, False], + [False, False, False]], dtype=bool) + mask = xr.DataArray(mask_data, dims=('y', 'x')) + df = polygonize(raster, mask=mask, return_type='geopandas') + assert len(df) == 1 # only the value-1 polygon remains + # The surviving polygon should land at the CRS origin, not pixel 0. + bounds = df.geometry.total_bounds + assert bounds[0] >= 1_000_000.0 - 1e-6 + assert bounds[2] <= 1_000_000.0 + 20.0 + 1e-6 + assert bounds[3] <= 5_000_000.0 + 1e-6 + def test_rio_transform_auto_detected(self): """rio.transform() is used when attrs['transform'] is absent.