Skip to content
Open
Show file tree
Hide file tree
Changes from 41 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
9916b0b
adding async for datatrees
aladinor Sep 13, 2025
afa42e9
adding async method to _maybe_create_index
aladinor Sep 13, 2025
d469f2e
using async as complete instead of gathering results
aladinor Sep 13, 2025
d53498a
adding tests for open_group, open_dtree and _maybe_create_index using…
aladinor Sep 14, 2025
3b26dd6
Merge branch 'main' into async-dtreec
aladinor Sep 14, 2025
b5ab48a
ensuing _maybe_create_default_indexes_async is compatible with zarr v2
aladinor Sep 14, 2025
94a9efd
resolving the mypy type errors
aladinor Sep 14, 2025
288a818
Merge branch 'async-dtreec' of https://github.com/aladinor/xarray int…
aladinor Sep 14, 2025
573a700
attemp 2: resolving mypy type errors
aladinor Sep 14, 2025
7557261
Merge branch 'main' into async-dtreec
aladinor Sep 15, 2025
3c10a23
Merge branch 'main' into async-dtreec
aladinor Oct 8, 2025
013804c
Merge branch 'main' into async-dtreec
aladinor Dec 9, 2025
f4ca679
Merge branch 'main' into async-dtreec
aladinor Dec 12, 2025
531c589
refactor: consolidate async index creation for DataTree opening
aladinor Dec 12, 2025
0ee2a73
perf: remove unnecessary semaphore from async datatree opening
aladinor Dec 12, 2025
6d6cd1e
fix: add zarr v2 fallback for datatree opening
aladinor Dec 12, 2025
640081b
Merge branch 'main' into async-dtreec
aladinor Dec 13, 2025
0ee154e
updating whats-new.rst file
aladinor Dec 13, 2025
542cad3
Merge branch 'async-dtreec' of https://github.com/aladinor/xarray int…
aladinor Dec 13, 2025
31b50dc
fix: re-add semaphore to async datatree opening to prevent deadlocks
aladinor Dec 13, 2025
b6a3b27
Merge branch 'main' into async-dtreec
aladinor Dec 15, 2025
6c4d9e4
Merge branch 'main' into async-dtreec
aladinor Dec 20, 2025
a711167
Merge branch 'main' into async-dtreec
aladinor Dec 22, 2025
2127aa5
Merge branch 'main' into async-dtreec
aladinor Dec 29, 2025
5c2f62c
Merge branch 'main' into async-dtreec
aladinor Jan 5, 2026
3a698d2
Merge branch 'main' into async-dtreec
aladinor Jan 10, 2026
2501b2a
refactor: use async index creation in sync open_datatree for zarr
aladinor Jan 10, 2026
c3ec77e
fix: add type ignore for mypy arg-type error in open_datatree_async
aladinor Jan 10, 2026
9bf810e
fix: add type annotations and fix Windows path in test
aladinor Jan 10, 2026
10e4756
fix: add type annotations to nested async functions for mypy
aladinor Jan 10, 2026
6f97d9c
Merge branch 'main' into async-dtreec
aladinor Jan 13, 2026
fbf4617
refactor: remove public open_datatree_async API per review feedback
aladinor Jan 14, 2026
c53f0bd
Merge branch 'main' into async-dtreec
aladinor Jan 15, 2026
5da5adc
refactor: convert _build_group_members to module-level helper function
aladinor Jan 16, 2026
1debb7c
fix: add cast for mypy type checking in _build_group_members
aladinor Jan 16, 2026
06bdab4
Merge branch 'main' into async-dtreec
keewis Jan 16, 2026
480b872
Update xarray/backends/api.py
aladinor Jan 16, 2026
02ee46b
refactor: use sync index creation in _maybe_create_default_indexes_async
aladinor Jan 16, 2026
b008620
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 16, 2026
422d127
Update xarray/backends/api.py
aladinor Jan 16, 2026
179c20c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 16, 2026
a4844b5
Update xarray/backends/api.py
aladinor Jan 16, 2026
b58ffb3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ Internal Changes
Performance
~~~~~~~~~~~

- Improve performance of :py:func:`open_datatree` for zarr stores by using async/concurrent
loading of groups and indexes (:pull:`10742`).
By `Alfonso Ladino <https://github.com/aladinor>`_.
- Add a fastpath to the backend plugin system for standard engines (:issue:`10178`, :pull:`10937`).
By `Sam Levang <https://github.com/slevang>`_.

Expand Down
65 changes: 64 additions & 1 deletion xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
NestedSequence,
T_Chunks,
)
from xarray.core.variable import Variable

T_NetcdfEngine = Literal["netcdf4", "scipy", "h5netcdf"]
T_Engine = Union[
Expand Down Expand Up @@ -348,7 +349,37 @@ def _datatree_from_backend_datatree(

_protect_datatree_variables_inplace(backend_tree, cache)
if create_default_indexes:
tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
_use_zarr_async = False
if engine == "zarr":
from xarray.backends.zarr import _zarr_v3

_use_zarr_async = _zarr_v3()

if _use_zarr_async:
from zarr.core.sync import sync as zarr_sync

async def create_indexes_async() -> dict[str, Dataset]:
import asyncio

results: dict[str, Dataset] = {}
tasks = [
_create_index_for_node(path, node.dataset)
for path, [node] in group_subtrees(backend_tree)
]
for fut in asyncio.as_completed(tasks):
path, ds = await fut
results[path] = ds
return results

async def _create_index_for_node(
path: str, ds: Dataset
) -> tuple[str, Dataset]:
return path, await _maybe_create_default_indexes_async(ds)

results = zarr_sync(create_indexes_async())
tree = DataTree.from_dict(results, name=backend_tree.name)
else:
tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
else:
tree = backend_tree
if chunks is not None:
Expand Down Expand Up @@ -385,6 +416,38 @@ def _datatree_from_backend_datatree(
return tree


async def _maybe_create_default_indexes_async(ds: Dataset) -> Dataset:
    """Load un-indexed dimension coordinates concurrently, then re-assign them.

    A coordinate is selected when its dimensions are exactly ``(name,)`` and
    ``ds.xindexes`` has no entry under that name. The selected variables are
    loaded concurrently with ``asyncio.gather``; only this loading step is
    asynchronous. Afterwards the loaded variables are wrapped in a single
    ``Coordinates`` object and handed to ``ds.assign_coords`` synchronously
    (presumably this is where the default indexes get built -- confirm
    against ``Coordinates``).

    Parameters
    ----------
    ds : Dataset
        Dataset whose dimension coordinates may still lack an index.

    Returns
    -------
    Dataset
        ``ds`` itself when no coordinate was selected, otherwise the result
        of ``ds.assign_coords`` for the selected coordinates.
    """
    import asyncio

    # Collect the names of dimension coordinates that have no index yet.
    pending = []
    for coord_name, coord in ds.coords.items():
        if coord.dims != (coord_name,):
            continue
        if coord_name in ds.xindexes:
            continue
        pending.append(coord_name)

    # Nothing to do: hand back the very same object.
    if not pending:
        return ds

    async def _load(variable: Variable) -> Variable:
        # Try the variable's own async loader first; if it reports that async
        # loading is not implemented, run the blocking load in a worker
        # thread so the event loop is not stalled.
        try:
            return await variable.load_async()
        except NotImplementedError:
            return await asyncio.to_thread(variable.load)

    load_jobs = [_load(ds.coords[coord_name].variable) for coord_name in pending]
    await asyncio.gather(*load_jobs)

    # Re-read the variables from the dataset after loading and assign them
    # back in one go.
    loaded_variables = {
        coord_name: ds.variables[coord_name] for coord_name in pending
    }
    return ds.assign_coords(Coordinates(loaded_variables))


def open_dataset(
filename_or_obj: T_PathFileOrDataStore,
*,
Expand Down
Loading
Loading