File: test_dask.py

package info (click to toggle)
python-xarray 2025.03.1-8
links: PTS, VCS
area: main
in suites: sid, trixie
size: 11,608 kB
sloc: python: 110,134; makefile: 255; sh: 53
file content (1830 lines) | stat: -rw-r--r-- 64,791 bytes
from __future__ import annotations

import operator
import pickle
import sys
from contextlib import suppress
from textwrap import dedent

import numpy as np
import pandas as pd
import pytest

import xarray as xr
import xarray.ufuncs as xu
from xarray import DataArray, Dataset, Variable
from xarray.core import duck_array_ops
from xarray.core.duck_array_ops import lazy_array_equiv
from xarray.core.indexes import PandasIndex
from xarray.testing import assert_chunks_equal
from xarray.tests import (
    assert_allclose,
    assert_array_equal,
    assert_equal,
    assert_frame_equal,
    assert_identical,
    mock,
    raise_if_dask_computes,
    requires_pint,
    requires_scipy_or_netCDF4,
)
from xarray.tests.test_backends import create_tmp_file

# Skip the whole module at import time when dask (or one of the dask
# submodules used below) cannot be imported.
dask = pytest.importorskip("dask")
da = pytest.importorskip("dask.array")
dd = pytest.importorskip("dask.dataframe")

# True when the tests are running on Windows.
ON_WINDOWS = sys.platform == "win32"


def test_raise_if_dask_computes():
    """``raise_if_dask_computes`` must turn any compute into a RuntimeError."""
    values = np.random.default_rng(0).random((4, 6))
    lazy = da.from_array(values, chunks=(2, 2))
    with pytest.raises(RuntimeError, match=r"Too many computes"), raise_if_dask_computes():
        lazy.compute()


class DaskTestCase:
    """Base class providing a helper that checks a result is still dask-backed."""

    def assertLazyAnd(self, expected, actual, test):
        """Compare ``actual`` with ``expected`` and check ``actual`` stayed lazy.

        Parameters
        ----------
        expected : object
            Reference object handed to ``test`` as its second argument.
        actual : Dataset, DataArray or Variable
            Object under test; its non-index data must be dask arrays.
        test : callable
            Comparison invoked as ``test(actual, expected)`` under the
            synchronous dask scheduler.

        Raises
        ------
        AssertionError
            If ``test`` fails, if some data has an unexpected array type, or
            if ``actual`` is not a Dataset, DataArray or Variable.
        """
        with dask.config.set(scheduler="synchronous"):
            test(actual, expected)

        if isinstance(actual, Dataset):
            self._assert_lazy_except_indexes(actual.variables, actual.xindexes)
        elif isinstance(actual, DataArray):
            assert isinstance(actual.data, da.Array)
            self._assert_lazy_except_indexes(actual.coords, actual.xindexes)
        elif isinstance(actual, Variable):
            assert isinstance(actual.data, da.Array)
        else:
            raise AssertionError(
                "expected a Dataset, DataArray or Variable, "
                f"got {type(actual).__name__}"
            )

    @staticmethod
    def _assert_lazy_except_indexes(variables, xindexes):
        """Check indexed entries hold numpy data and all others hold dask data."""
        for name, var in variables.items():
            if name in xindexes:
                assert isinstance(var.data, np.ndarray)
            else:
                assert isinstance(var.data, da.Array)


class TestVariable(DaskTestCase):
    def assertLazyAndIdentical(self, expected, actual):
        """Delegate to ``assertLazyAnd`` with ``assert_identical`` as the check."""
        self.assertLazyAnd(expected, actual, test=assert_identical)

    def assertLazyAndAllClose(self, expected, actual):
        """Delegate to ``assertLazyAnd`` with ``assert_allclose`` as the check."""
        self.assertLazyAnd(expected, actual, test=assert_allclose)

    @pytest.fixture(autouse=True)
    def setUp(self):
        """Build a seeded 4x6 numpy array plus eager and dask-backed Variables."""
        dims = ("x", "y")
        rng = np.random.default_rng(0)
        self.values = rng.random((4, 6))
        self.data = da.from_array(self.values, chunks=(2, 2))

        self.eager_var = Variable(dims, self.values)
        self.lazy_var = Variable(dims, self.data)

    def test_basics(self):
        """The Variable exposes the very same dask array, chunks and values."""
        lazy = self.lazy_var
        assert lazy.data is self.data
        assert lazy.chunks == self.data.chunks
        assert_array_equal(self.values, lazy)

    def test_copy(self):
        """Shallow and deep copies stay lazy and identical to the eager variable."""
        for copy_kwargs in ({}, {"deep": True}):
            copied = self.lazy_var.copy(**copy_kwargs)
            self.assertLazyAndIdentical(self.eager_var, copied)

    def test_chunk(self):
        """``chunk`` accepts empty, integer, per-dimension and tuple specs."""
        # (chunk specification, resulting chunks along x and y)
        cases = [
            ({}, ((2, 2), (2, 2, 2))),
            (3, ((3, 1), (3, 3))),
            ({"x": 3, "y": 3}, ((3, 1), (3, 3))),
            ({"x": 3}, ((3, 1), (2, 2, 2))),
            ({"x": (3, 1)}, ((3, 1), (2, 2, 2))),
        ]
        for spec, want in cases:
            result = self.lazy_var.chunk(spec)
            assert result.chunks == want
            self.assertLazyAndIdentical(self.eager_var, result)

            want_by_dim = dict(zip(self.lazy_var.dims, want, strict=True))
            assert result.chunksizes == want_by_dim

    def test_indexing(self):
        """Integer, slice and list-based indexing keep the result lazy."""
        eager, lazy = self.eager_var, self.lazy_var
        self.assertLazyAndIdentical(eager[0], lazy[0])
        self.assertLazyAndIdentical(eager[:1], lazy[:1])
        self.assertLazyAndIdentical(
            eager[[0, 1], [0, 1, 2]],
            lazy[[0, 1], [0, 1, 2]],
        )

    @pytest.mark.parametrize(
        "expected_data, index",
        [
            (da.array([99, 2, 3, 4]), 0),
            (da.array([99, 99, 99, 4]), slice(2, None, -1)),
            (da.array([99, 99, 3, 99]), [0, -1, 1]),
            (da.array([99, 99, 99, 4]), np.arange(3)),
            (da.array([1, 99, 99, 99]), [False, True, True, True]),
            (da.array([1, 99, 99, 99]), np.array([False, True, True, True])),
            (da.array([99, 99, 99, 99]), Variable(("x"), np.array([True] * 4))),
        ],
    )
    def test_setitem_dask_array(self, expected_data, index):
        """Assigning through ``index`` must not compute the dask array."""
        target = Variable("x", da.array([1, 2, 3, 4]))
        with raise_if_dask_computes():
            target[index] = 99
        assert_identical(target, Variable("x", expected_data))

    def test_squeeze(self):
        """``squeeze`` on an indexed row stays lazy."""
        eager_row = self.eager_var[0]
        lazy_row = self.lazy_var[0]
        self.assertLazyAndIdentical(eager_row.squeeze(), lazy_row.squeeze())

    def test_equals(self):
        """Comparing a variable with itself leaves its data a dask array."""
        lazy = self.lazy_var
        for compare in (lazy.equals, lazy.identical):
            assert compare(lazy)
            assert isinstance(lazy.data, da.Array)

    def test_transpose(self):
        """``.T`` stays lazy and matches the eager transpose."""
        self.assertLazyAndIdentical(self.eager_var.T, self.lazy_var.T)

    def test_shift(self):
        """Shifting in either direction is lazy and keeps the chunk layout."""
        eager, lazy = self.eager_var, self.lazy_var
        for offset in (2, -2):
            self.assertLazyAndIdentical(eager.shift(x=offset), lazy.shift(x=offset))
        assert lazy.data.chunks == lazy.shift(x=1).data.chunks

    def test_roll(self):
        """Rolling is lazy and keeps the chunk layout."""
        eager, lazy = self.eager_var, self.lazy_var
        self.assertLazyAndIdentical(eager.roll(x=2), lazy.roll(x=2))
        rolled_chunks = lazy.roll(x=1).data.chunks
        assert lazy.data.chunks == rolled_chunks

    def test_unary_op(self):
        """Negation, ``abs`` and ``round`` stay lazy."""
        eager, lazy = self.eager_var, self.lazy_var
        unary_ops = (operator.neg, abs, operator.methodcaller("round"))
        for op in unary_ops:
            self.assertLazyAndIdentical(op(eager), op(lazy))

    def test_binary_op(self):
        """Scalar, self and broadcast binary operations stay lazy."""
        eager, lazy = self.eager_var, self.lazy_var
        expressions = (
            lambda var: 2 * var,
            lambda var: var + var,
            lambda var: var[0] + var,
        )
        for expr in expressions:
            self.assertLazyAndIdentical(expr(eager), expr(lazy))

    def test_binary_op_bitshift(self) -> None:
        """Left and right bit shifts stay lazy."""
        # Bit shifts need integer data, so the float fixtures cannot be reused.
        ints = np.random.default_rng(0).integers(low=-10000, high=10000, size=(4, 6))
        eager = Variable(("x", "y"), ints)
        lazy = Variable(("x", "y"), da.from_array(ints, chunks=(2, 2)))
        for shift in (operator.lshift, operator.rshift):
            for amount in (2, 5):
                self.assertLazyAndIdentical(shift(eager, amount), shift(lazy, amount))

    def test_repr(self):
        """The repr is a header line followed by the dask array's own repr."""
        actual = repr(self.lazy_var)
        expected = dedent(
            f"""\
            <xarray.Variable (x: 4, y: 6)> Size: 192B
            {self.lazy_var.data!r}"""
        )
        assert actual == expected

    def test_pickle(self):
        """A pickle round-trip must keep the dask backend and not recompute."""
        original = Variable(["x"], build_dask_array("x"))
        original.compute()
        assert not original._in_memory
        assert kernel_call_count == 1

        restored = pickle.loads(pickle.dumps(original))
        # The round-trip itself must not have called the kernel again.
        assert kernel_call_count == 1
        assert_identical(original, restored)
        for var in (original, restored):
            assert not var._in_memory

    def test_reduce(self):
        """Reductions stay lazy; a median over all axes is rejected."""
        eager, lazy = self.eager_var, self.lazy_var
        self.assertLazyAndAllClose(eager.mean(), lazy.mean())
        self.assertLazyAndAllClose(eager.std(), lazy.std())

        for name in ("argmax", "argmin"):
            with raise_if_dask_computes():
                actual = getattr(lazy, name)(dim="x")
            self.assertLazyAndAllClose(getattr(eager, name)(dim="x"), actual)

        self.assertLazyAndAllClose((eager > 1).any(), (lazy > 1).any())
        self.assertLazyAndAllClose((eager < 1).all("x"), (lazy < 1).all("x"))

        # Neither the default (no dim) nor an explicit all-dims median is supported.
        for median_args in ((), (lazy.dims,)):
            with pytest.raises(NotImplementedError, match=r"only works along an axis"):
                lazy.median(*median_args)

        with raise_if_dask_computes():
            lazy.reduce(duck_array_ops.mean)

    def test_missing_values(self):
        """``fillna`` and ``count`` handle NaN lazily."""
        raw = np.array([0, 1, np.nan, 3])
        eager = Variable("x", raw)
        lazy = Variable("x", da.from_array(raw, chunks=(2,)))

        self.assertLazyAndIdentical(eager, lazy.fillna(lazy))
        filled = Variable("x", range(4))
        self.assertLazyAndIdentical(filled, lazy.fillna(2))
        self.assertLazyAndIdentical(eager.count(), lazy.count())

    def test_concat(self):
        """Concatenating lazy, mixed and positioned pieces yields a lazy result."""
        eager, lazy = self.eager_var, self.lazy_var
        self.assertLazyAndIdentical(eager, Variable.concat([lazy[:2], lazy[2:]], "x"))

        first_two = eager[:2]
        # All-lazy inputs, then each eager/lazy mix.
        self.assertLazyAndIdentical(first_two, Variable.concat([lazy[0], lazy[1]], "x"))
        self.assertLazyAndIdentical(first_two, Variable.concat([eager[0], lazy[1]], "x"))
        self.assertLazyAndIdentical(first_two, Variable.concat([lazy[0], eager[1]], "x"))

        interleaved = Variable.concat(
            [lazy[[0, 2]], lazy[[1]]], "x", positions=[[0, 2], [1]]
        )
        self.assertLazyAndIdentical(eager[:3], interleaved)

    def test_missing_methods(self):
        """Methods unsupported on dask data raise an error that mentions dask.

        ``pytest.raises`` is used so the test fails when no exception is
        raised at all; a plain ``try``/``except`` with the assertion inside
        the handler would pass silently in that case.
        """
        v = self.lazy_var
        with pytest.raises(NotImplementedError, match="dask"):
            v.argsort()
        with pytest.raises(NotImplementedError, match="dask"):
            v[0].item()

    def test_univariate_ufunc(self):
        """A one-argument numpy ufunc applied to a lazy variable stays lazy."""
        expected = np.sin(self.eager_var)
        actual = np.sin(self.lazy_var)
        self.assertLazyAndAllClose(expected, actual)

    def test_bivariate_ufunc(self):
        """A two-argument numpy ufunc stays lazy for either argument order."""
        eager, lazy = self.eager_var, self.lazy_var
        self.assertLazyAndAllClose(np.maximum(eager, 0), np.maximum(lazy, 0))
        # Same expected value, but with the lazy variable as second operand.
        self.assertLazyAndAllClose(np.maximum(eager, 0), np.maximum(0, lazy))

    def test_univariate_xufunc(self):
        """``xarray.ufuncs.sin`` on a lazy variable matches numpy on the eager one."""
        expected = np.sin(self.eager_var)
        actual = xu.sin(self.lazy_var)
        self.assertLazyAndAllClose(expected, actual)

    def test_bivariate_xufunc(self):
        """``xarray.ufuncs.maximum`` stays lazy for either argument order."""
        eager, lazy = self.eager_var, self.lazy_var
        self.assertLazyAndAllClose(np.maximum(eager, 0), xu.maximum(lazy, 0))
        # Same expected value, but with the lazy variable as second operand.
        self.assertLazyAndAllClose(np.maximum(eager, 0), xu.maximum(0, lazy))

    def test_compute(self):
        """``dask.compute`` returns an in-memory variable with the right values."""
        eager, lazy = self.eager_var, self.lazy_var

        assert dask.is_dask_collection(lazy)
        [computed] = dask.compute(lazy + 1)
        assert not dask.is_dask_collection(computed)

        assert ((eager + 1).data == computed.data).all()

    def test_persist(self):
        """``dask.persist`` returns a new collection with a smaller graph."""
        eager = self.eager_var
        lazy = self.lazy_var + 1

        [persisted] = dask.persist(lazy)
        assert lazy is not persisted
        assert len(persisted.__dask_graph__()) < len(lazy.__dask_graph__())
        assert persisted.__dask_keys__() == lazy.__dask_keys__()
        assert dask.is_dask_collection(lazy)
        assert dask.is_dask_collection(persisted)

        self.assertLazyAndAllClose(eager + 1, lazy)
        self.assertLazyAndAllClose(eager + 1, persisted)

    @requires_pint
    def test_tokenize_duck_dask_array(self):
        """Tokens of a pint-wrapped dask variable change with the data only."""
        import pint

        ureg = pint.UnitRegistry()

        quantity = ureg.Quantity(self.data, "meter")
        variable = xr.Variable(("x", "y"), quantity)

        token_before = dask.base.tokenize(variable)
        shifted = variable + 5 * ureg.meter

        assert dask.base.tokenize(variable) != dask.base.tokenize(shifted)
        # The operation above must not have changed the original's token.
        assert dask.base.tokenize(variable) == token_before


class TestDataArrayAndDataset(DaskTestCase):
    def assertLazyAndIdentical(self, expected, actual):
        """Delegate to ``assertLazyAnd`` with ``assert_identical`` as the check."""
        self.assertLazyAnd(expected, actual, test=assert_identical)

    def assertLazyAndAllClose(self, expected, actual):
        """Delegate to ``assertLazyAnd`` with ``assert_allclose`` as the check."""
        self.assertLazyAnd(expected, actual, test=assert_allclose)

    def assertLazyAndEqual(self, expected, actual):
        """Delegate to ``assertLazyAnd`` with ``assert_equal`` as the check."""
        self.assertLazyAnd(expected, actual, test=assert_equal)

    @pytest.fixture(autouse=True)
    def setUp(self):
        """Build a 4x6 normal sample plus eager and dask-backed DataArrays.

        A seeded generator is used (as in ``TestVariable.setUp``) so that any
        failure can be reproduced with the same values.
        """
        self.values = np.random.default_rng(0).standard_normal((4, 6))
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(
            self.values, coords={"x": range(4)}, dims=("x", "y"), name="foo"
        )
        self.lazy_array = DataArray(
            self.data, coords={"x": range(4)}, dims=("x", "y"), name="foo"
        )

    def test_chunk(self) -> None:
        """``chunk`` handles the same specs on a DataArray and on a Dataset."""
        # (chunk specification, resulting chunks along x and y)
        cases = [
            ({}, ((2, 2), (2, 2, 2))),
            (3, ((3, 1), (3, 3))),
            ({"x": 3, "y": 3}, ((3, 1), (3, 3))),
            ({"x": 3}, ((3, 1), (2, 2, 2))),
            ({"x": (3, 1)}, ((3, 1), (2, 2, 2))),
            ({"x": "16B"}, ((1, 1, 1, 1), (2, 2, 2))),
            ("16B", ((1, 1, 1, 1), (1,) * 6)),
            ("16MB", ((4,), (6,))),
        ]
        for spec, want in cases:
            # DataArray
            array_result = self.lazy_array.chunk(spec)
            assert array_result.chunks == want
            self.assertLazyAndIdentical(self.eager_array, array_result)

            want_by_dim = dict(zip(self.lazy_array.dims, want, strict=True))
            assert array_result.chunksizes == want_by_dim

            # Dataset
            lazy_ds = self.lazy_array.to_dataset()
            eager_ds = self.eager_array.to_dataset()
            want_by_dim = dict(zip(lazy_ds.dims, want, strict=True))
            ds_result = lazy_ds.chunk(spec)

            # Unlike DataArray.chunks, Dataset.chunks is a mapping keyed by
            # dimension name - see issue #5843
            assert ds_result.chunks == want_by_dim
            self.assertLazyAndIdentical(eager_ds, ds_result)

            assert ds_result.chunksizes == want_by_dim

    def test_rechunk(self):
        """Chunking along x and then y gives the fixture's 2x2 chunk layout."""
        chunked_x = self.eager_array.chunk({"x": 2})
        chunked_xy = chunked_x.chunk({"y": 2})
        assert chunked_xy.chunks == ((2, 2), (2, 2, 2))
        self.assertLazyAndIdentical(self.lazy_array, chunked_xy)

    def test_new_chunk(self):
        """A freshly chunked array gets an ``xarray-<this-array>`` dask name."""
        dask_name = self.eager_array.chunk().data.name
        assert dask_name.startswith("xarray-<this-array>")

    def test_lazy_dataset(self):
        """A Dataset built from a dask array keeps it as a dask array."""
        lazy_ds = Dataset({"foo": (("x", "y"), self.data)})
        stored = lazy_ds.foo.variable.data
        assert isinstance(stored, da.Array)

    def test_lazy_array(self):
        """Basic operations and ``concat`` on a lazy DataArray stay lazy."""
        eager, lazy = self.eager_array, self.lazy_array

        self.assertLazyAndAllClose(eager, lazy)
        self.assertLazyAndAllClose(-eager, -lazy)
        self.assertLazyAndAllClose(eager.T, lazy.T)
        self.assertLazyAndAllClose(eager.mean(), lazy.mean())
        self.assertLazyAndAllClose(1 + eager, 1 + lazy)

        rejoined = xr.concat([lazy[:2], lazy[2:]], "x")
        self.assertLazyAndAllClose(eager, rejoined)

    def test_compute(self):
        """``dask.compute`` returns an in-memory array with the right values."""
        eager, lazy = self.eager_array, self.lazy_array

        assert dask.is_dask_collection(lazy)
        [computed] = dask.compute(lazy + 1)
        assert not dask.is_dask_collection(computed)

        assert ((eager + 1).data == computed.data).all()

    def test_persist(self):
        """``dask.persist`` returns a new collection with a smaller graph."""
        eager = self.eager_array
        lazy = self.lazy_array + 1

        [persisted] = dask.persist(lazy)
        assert lazy is not persisted
        assert len(persisted.__dask_graph__()) < len(lazy.__dask_graph__())
        assert persisted.__dask_keys__() == lazy.__dask_keys__()
        assert dask.is_dask_collection(lazy)
        assert dask.is_dask_collection(persisted)

        self.assertLazyAndAllClose(eager + 1, lazy)
        self.assertLazyAndAllClose(eager + 1, persisted)

    def test_concat_loads_variables(self):
        # Test that concat() computes not-in-memory variables at most once
        # and loads them in the output, while leaving the input unaltered.
        #
        # The assertions track the running total of ``kernel_call_count``, so
        # the order of the statements below matters.
        d1 = build_dask_array("d1")
        c1 = build_dask_array("c1")
        d2 = build_dask_array("d2")
        c2 = build_dask_array("c2")
        d3 = build_dask_array("d3")
        c3 = build_dask_array("c3")
        # Note: c is a non-index coord.
        # Index coords are loaded by IndexVariable.__init__.
        ds1 = Dataset(data_vars={"d": ("x", d1)}, coords={"c": ("x", c1)})
        ds2 = Dataset(data_vars={"d": ("x", d2)}, coords={"c": ("x", c2)})
        ds3 = Dataset(data_vars={"d": ("x", d3)}, coords={"c": ("x", c3)})

        assert kernel_call_count == 0
        out = xr.concat(
            [ds1, ds2, ds3], dim="n", data_vars="different", coords="different"
        )
        # each kernel is computed exactly once
        assert kernel_call_count == 6
        # variables are loaded in the output
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3], dim="n", data_vars="all", coords="all")
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim="n", data_vars=["d"], coords=["c"])
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim="n", data_vars=[], coords=[])
        # variables are loaded once as we are validating that they're identical
        assert kernel_call_count == 12
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        out = xr.concat(
            [ds1, ds2, ds3],
            dim="n",
            data_vars="different",
            coords="different",
            compat="identical",
        )
        # compat=identical doesn't do any more kernel calls than compat=equals
        # (six more calls, the same increase as the first concat above)
        assert kernel_call_count == 18
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        # When the test for different turns true halfway through,
        # stop computing variables as it would not have any benefit
        ds4 = Dataset(data_vars={"d": ("x", [2.0])}, coords={"c": ("x", [2.0])})
        out = xr.concat(
            [ds1, ds2, ds4, ds3], dim="n", data_vars="different", coords="different"
        )
        # the variables of ds1 and ds2 were computed, but those of ds3 were not
        assert kernel_call_count == 22
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)
        # the data of ds1 and ds2 was loaded into numpy and then
        # concatenated to the data of ds3. Thus, only ds3 is computed now.
        out.compute()
        assert kernel_call_count == 24

        # Check that the originals are unaltered so far
        assert ds1["d"].data is d1
        assert ds1["c"].data is c1
        assert ds2["d"].data is d2
        assert ds2["c"].data is c2
        assert ds3["d"].data is d3
        assert ds3["c"].data is c3

        # now check that concat() is correctly using dask name equality to skip loads
        out = xr.concat(
            [ds1, ds1, ds1], dim="n", data_vars="different", coords="different"
        )
        assert kernel_call_count == 24
        # variables are not loaded in the output
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat(
            [ds1, ds1, ds1], dim="n", data_vars=[], coords=[], compat="identical"
        )
        assert kernel_call_count == 24
        # variables are not loaded in the output
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat(
            [ds1, ds2.compute(), ds3],
            dim="n",
            data_vars="all",
            coords="different",
            compat="identical",
        )
        # c1,c3 must be computed for comparison since c2 is numpy;
        # d2 is computed too
        # NOTE(review): presumably ds2.compute() itself accounts for two of
        # the four extra calls (d2 and c2) - confirm against build_dask_array.
        assert kernel_call_count == 28

        out = xr.concat(
            [ds1, ds2.compute(), ds3],
            dim="n",
            data_vars="all",
            coords="all",
            compat="identical",
        )
        # no extra computes by concat() itself; the count still rises by two,
        # presumably from the explicit ds2.compute() above
        assert kernel_call_count == 30

        # Finally, test that originals are unaltered
        assert ds1["d"].data is d1
        assert ds1["c"].data is c1
        assert ds2["d"].data is d2
        assert ds2["c"].data is c2
        assert ds3["d"].data is d3
        assert ds3["c"].data is c3

    def test_groupby(self):
        """A groupby mean over all dimensions must not trigger a compute."""
        expected = self.eager_array.groupby("x").mean(...)
        with raise_if_dask_computes():
            grouped = self.lazy_array.groupby("x")
            actual = grouped.mean(...)
        self.assertLazyAndAllClose(expected, actual)

    def test_rolling(self):
        """A rolling mean must not trigger a compute."""
        expected = self.eager_array.rolling(x=2).mean()
        with raise_if_dask_computes():
            window = self.lazy_array.rolling(x=2)
            actual = window.mean()
        self.assertLazyAndAllClose(expected, actual)

    @pytest.mark.parametrize("func", ["first", "last"])
    def test_groupby_first_last(self, func):
        """``first``/``last`` on a groupby must not trigger a compute."""
        apply = operator.methodcaller(func)
        eager, lazy = self.eager_array, self.lazy_array

        # Attach the same non-index grouping coordinate to both arrays.
        for array in (eager, lazy):
            array.coords["ab"] = ("x", ["a", "a", "b", "b"])
        expected = apply(eager.groupby("ab"))

        # NOTE(review): the original ran this lazy check twice verbatim; the
        # repetition is kept - confirm whether the second pass is still needed.
        for _ in range(2):
            with raise_if_dask_computes():
                actual = apply(lazy.groupby("ab"))
            self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        """Reindexing along one or two dimensions stays lazy."""
        eager = self.eager_array.assign_coords(y=range(6))
        lazy = self.lazy_array.assign_coords(y=range(6))

        indexer_sets = [
            {"x": [2, 3, 4]},
            {"x": [1, 100, 2, 101, 3]},
            {"x": [2.5, 3, 3.5], "y": [2, 2.5, 3]},
        ]
        for indexers in indexer_sets:
            expected = eager.reindex(**indexers)
            actual = lazy.reindex(**indexers)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        """Splitting along x into a Dataset and back stays lazy."""
        eager = self.eager_array
        expected = eager.assign_coords(x=eager["x"])

        as_dataset = self.lazy_array.to_dataset("x")
        roundtripped = as_dataset.to_dataarray("x")
        self.assertLazyAndEqual(expected, roundtripped)

    def test_merge(self):
        """Merging an array with a renamed copy of itself stays lazy."""

        def merge_with_renamed_copy(arr):
            merged = xr.merge([arr, arr.rename("bar")])
            return merged.to_dataarray()

        self.assertLazyAndEqual(
            merge_with_renamed_copy(self.eager_array),
            merge_with_renamed_copy(self.lazy_array),
        )

    def test_ufuncs(self):
        """A numpy ufunc applied to a lazy DataArray stays lazy."""
        expected = np.sin(self.eager_array)
        actual = np.sin(self.lazy_array)
        self.assertLazyAndAllClose(expected, actual)

    def test_where_dispatching(self):
        """``where`` is lazy when the data, the condition or both are dask."""
        np_values = np.arange(10)
        np_mask = np_values > 3
        dask_values = da.from_array(np_values, 5)
        dask_mask = da.from_array(np_mask, 5)

        expected = DataArray(np_values).where(np_mask)
        combos = (
            (np_values, dask_mask),
            (dask_values, np_mask),
            (dask_values, dask_mask),
        )
        for values, mask in combos:
            self.assertLazyAndEqual(expected, DataArray(values).where(mask))

    def test_simultaneous_compute(self):
        """Loading a Dataset calls the scheduler once for all variables."""
        ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk()

        n_calls = 0

        def counting_get(*args, **kwargs):
            # Count each scheduler invocation, then defer to dask's own get.
            nonlocal n_calls
            n_calls += 1
            return dask.get(*args, **kwargs)

        ds.load(scheduler=counting_get)

        assert n_calls == 1

    def test_duplicate_dims(self):
        """Creating and chunking an array with a repeated dimension warns."""
        square = np.random.normal(size=(4, 4))
        with pytest.warns(UserWarning, match="Duplicate dimension"):
            duplicated = DataArray(square, dims=("x", "x"))
        with pytest.warns(UserWarning, match="Duplicate dimension"):
            chunked = duplicated.chunk({"x": 2})
        assert chunked.chunks == ((2, 2), (2, 2))
        assert chunked.chunksizes == {"x": (2, 2)}

    def test_stack(self):
        """Stacking two dimensions matches a reshape, chunks included."""
        raw = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        stacked = DataArray(raw, dims=("w", "x", "y")).stack(z=("x", "y"))

        index = pd.MultiIndex.from_product(
            [np.arange(3), np.arange(4)], names=["x", "y"]
        )
        expected = DataArray(raw.reshape(2, -1), {"z": index}, dims=["w", "z"])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        """``dot`` with the array's own first row stays lazy."""
        eager_arr, lazy_arr = self.eager_array, self.lazy_array
        expected = eager_arr.dot(eager_arr[0])
        actual = lazy_arr.dot(lazy_arr[0])
        self.assertLazyAndAllClose(expected, actual)

    def test_dataarray_repr(self):
        """The repr of a dask-backed DataArray must not evaluate its data."""
        data = build_dask_array("data")
        coord_data = build_dask_array("coord")
        array = DataArray(data, dims=["x"], coords={"y": ("x", coord_data)})
        expected = dedent(
            f"""\
            <xarray.DataArray 'data' (x: 1)> Size: 8B
            {data!r}
            Coordinates:
                y        (x) int64 8B dask.array<chunksize=(1,), meta=np.ndarray>
            Dimensions without coordinates: x"""
        )
        actual = repr(array)
        assert actual == expected
        # Building the repr must not have called the kernel.
        assert kernel_call_count == 0

    def test_dataset_repr(self):
        """The repr of a dask-backed Dataset must not evaluate its data."""
        var_data = build_dask_array("data")
        coord_data = build_dask_array("coord")
        dataset = Dataset(
            data_vars={"a": ("x", var_data)}, coords={"y": ("x", coord_data)}
        )
        expected = dedent(
            """\
            <xarray.Dataset> Size: 16B
            Dimensions:  (x: 1)
            Coordinates:
                y        (x) int64 8B dask.array<chunksize=(1,), meta=np.ndarray>
            Dimensions without coordinates: x
            Data variables:
                a        (x) int64 8B dask.array<chunksize=(1,), meta=np.ndarray>"""
        )
        actual = repr(dataset)
        assert actual == expected
        # Building the repr must not have called the kernel.
        assert kernel_call_count == 0

    def test_dataarray_pickle(self):
        """Pickling keeps the data and the non-index coord dask-backed."""
        var_data = build_dask_array("data")
        coord_data = build_dask_array("coord")
        original = DataArray(var_data, dims=["x"], coords={"y": ("x", coord_data)})
        original.compute()
        assert not original._in_memory
        assert not original.coords["y"]._in_memory
        assert kernel_call_count == 2

        restored = pickle.loads(pickle.dumps(original))
        # The round-trip itself must not have called the kernels again.
        assert kernel_call_count == 2
        assert_identical(original, restored)
        assert not original._in_memory
        assert not restored._in_memory
        assert not original.coords["y"]._in_memory
        assert not restored.coords["y"]._in_memory

    def test_dataset_pickle(self):
        """Pickling keeps data variables and non-index coords dask-backed."""
        var_data = build_dask_array("data")
        coord_data = build_dask_array("coord")
        original = Dataset(
            data_vars={"a": ("x", var_data)}, coords={"y": ("x", coord_data)}
        )
        original.compute()
        assert not original["a"]._in_memory
        assert not original["y"]._in_memory
        assert kernel_call_count == 2

        restored = pickle.loads(pickle.dumps(original))
        # The round-trip itself must not have called the kernels again.
        assert kernel_call_count == 2
        assert_identical(original, restored)
        assert not original["a"]._in_memory
        assert not restored["a"]._in_memory
        assert not original["y"]._in_memory
        assert not restored["y"]._in_memory

    def test_dataarray_getattr(self):
        """Looking up a missing attribute must not compute dask variables."""
        # ipython/jupyter does a long list of getattr() calls when trying to
        # represent an object, so a failed lookup has to stay cheap.
        var_data = build_dask_array("data")
        coord_data = build_dask_array("coord")
        array = DataArray(var_data, dims=["x"], coords={"y": ("x", coord_data)})
        with suppress(AttributeError):
            _ = array.NOTEXIST
        assert kernel_call_count == 0

    def test_dataset_getattr(self):
        """Looking up a missing attribute must not compute dask variables."""
        # Dataset counterpart of the DataArray check: a failed getattr() must
        # leave the kernel call counter untouched.
        var_data = build_dask_array("data")
        coord_data = build_dask_array("coord")
        dataset = Dataset(
            data_vars={"a": ("x", var_data)}, coords={"y": ("x", coord_data)}
        )
        with suppress(AttributeError):
            _ = dataset.NOTEXIST
        assert kernel_call_count == 0

    def test_values(self):
        """Reading ``.values`` must not swap the dask backend for numpy."""
        chunked = DataArray([1, 2]).chunk()
        assert not chunked._in_memory
        as_list = chunked.values.tolist()
        assert as_list == [1, 2]
        assert not chunked._in_memory

    def test_from_dask_variable(self):
        """A DataArray built from a dask-backed Variable stays lazy."""
        # This construction path is used e.g. in broadcast()
        lazy_variable = self.lazy_array.variable
        rebuilt = DataArray(lazy_variable, coords={"x": range(4)}, name="foo")
        self.assertLazyAndIdentical(self.lazy_array, rebuilt)

    @requires_pint
    def test_tokenize_duck_dask_array(self):
        """Tokens of a pint-wrapped dask DataArray change with the data only."""
        import pint

        ureg = pint.UnitRegistry()

        quantity = ureg.Quantity(self.data, ureg.meter)
        data_array = xr.DataArray(
            data=quantity, coords={"x": range(4)}, dims=("x", "y"), name="foo"
        )

        token_before = dask.base.tokenize(data_array)
        shifted = data_array + 5 * ureg.meter

        assert dask.base.tokenize(data_array) != dask.base.tokenize(shifted)
        # The operation above must not have changed the original's token.
        assert dask.base.tokenize(data_array) == token_before


class TestToDaskDataFrame:
    """Tests for ``Dataset.to_dask_dataframe``.

    Each test builds a small Dataset, converts it to a dask DataFrame and
    compares the computed result against a pandas frame built by hand or via
    ``to_dataframe``/``to_series``.
    """

    @pytest.mark.xfail(reason="https://github.com/dask/dask/issues/11584")
    def test_to_dask_dataframe(self):
        # Test conversion of Datasets to dask DataFrames
        x = np.random.randn(10)
        y = np.arange(10, dtype="uint8")
        t = list("abcdefghij")

        # "a" is dask-backed, "b" and the index coordinate "t" are in memory
        ds = Dataset(
            {"a": ("t", da.from_array(x, chunks=4)), "b": ("t", y), "t": ("t", t)}
        )

        expected_pd = pd.DataFrame({"a": x, "b": y}, index=pd.Index(t, name="t"))

        # test if 1-D index is correctly set up
        expected = dd.from_pandas(expected_pd, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=True)
        # test if we have dask dataframes
        assert isinstance(actual, dd.DataFrame)

        # compare the computed pandas frames with assert_frame_equal
        assert_frame_equal(actual.compute(), expected.compute())

        # test if no index is given: "t" becomes an ordinary column
        expected = dd.from_pandas(expected_pd.reset_index(drop=False), chunksize=4)

        actual = ds.to_dask_dataframe(set_index=False)

        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(actual.compute(), expected.compute())

    @pytest.mark.xfail(
        reason="Currently pandas with pyarrow installed will return a `string[pyarrow]` type, "
        "which causes the `y` column to have a different type depending on whether pyarrow is installed"
    )
    def test_to_dask_dataframe_2D(self):
        # Test if 2-D dataset is supplied
        w = np.random.randn(2, 3)
        ds = Dataset({"w": (("x", "y"), da.from_array(w, chunks=(1, 2)))})
        ds["x"] = ("x", np.array([0, 1], np.int64))
        ds["y"] = ("y", list("abc"))

        # dask dataframes do not (yet) support multiindex,
        # but when it does, this would be the expected index:
        exp_index = pd.MultiIndex.from_arrays(
            [[0, 0, 0, 1, 1, 1], ["a", "b", "c", "a", "b", "c"]], names=["x", "y"]
        )
        expected = pd.DataFrame({"w": w.reshape(-1)}, index=exp_index)
        # so for now, reset the index
        expected = expected.reset_index(drop=False)
        actual = ds.to_dask_dataframe(set_index=False)

        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(actual.compute(), expected)

    @pytest.mark.xfail(raises=NotImplementedError)
    def test_to_dask_dataframe_2D_set_index(self):
        # This will fail until dask implements MultiIndex support
        w = da.from_array(np.random.randn(2, 3), chunks=(1, 2))
        ds = Dataset({"w": (("x", "y"), w)})
        ds["x"] = ("x", np.array([0, 1], np.int64))
        ds["y"] = ("y", list("abc"))

        # the eager pandas conversion serves as the reference result
        expected = ds.compute().to_dataframe()
        actual = ds.to_dask_dataframe(set_index=True)
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

    def test_to_dask_dataframe_coordinates(self):
        # Test if coordinate is also a dask array
        x = np.random.randn(10)
        t = np.arange(10) * 2

        ds = Dataset(
            {
                "a": ("t", da.from_array(x, chunks=4)),
                "t": ("t", da.from_array(t, chunks=4)),
            }
        )

        expected_pd = pd.DataFrame({"a": x}, index=pd.Index(t, name="t"))
        expected = dd.from_pandas(expected_pd, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=True)
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected.compute(), actual.compute())

    @pytest.mark.xfail(
        reason="Currently pandas with pyarrow installed will return a `string[pyarrow]` type, "
        "which causes the index to have a different type depending on whether pyarrow is installed"
    )
    def test_to_dask_dataframe_not_daskarray(self):
        # Test if DataArray is not a dask array
        x = np.random.randn(10)
        y = np.arange(10, dtype="uint8")
        t = list("abcdefghij")

        # every variable here is a plain in-memory (non-dask) array
        ds = Dataset({"a": ("t", x), "b": ("t", y), "t": ("t", t)})

        expected = pd.DataFrame({"a": x, "b": y}, index=pd.Index(t, name="t"))

        actual = ds.to_dask_dataframe(set_index=True)
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

    def test_to_dask_dataframe_no_coordinate(self):
        # Test a dimension ("dim_0") that has no coordinate variable attached
        x = da.from_array(np.random.randn(10), chunks=4)
        ds = Dataset({"x": ("dim_0", x)})

        # default call: compare against the eager frame with its index reset
        expected = ds.compute().to_dataframe().reset_index()
        actual = ds.to_dask_dataframe()
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

        # set_index=True: compare against the eager frame as-is
        expected = ds.compute().to_dataframe()
        actual = ds.to_dask_dataframe(set_index=True)
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

    def test_to_dask_dataframe_dim_order(self):
        # Test that dim_order controls the row ordering of the resulting frame
        values = np.array([[1, 2], [3, 4]], dtype=np.int64)
        ds = Dataset({"w": (("x", "y"), values)}).chunk(1)

        expected = ds["w"].to_series().reset_index()
        actual = ds.to_dask_dataframe(dim_order=["x", "y"])
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

        # reversed dimension order corresponds to the transposed array
        expected = ds["w"].T.to_series().reset_index()
        actual = ds.to_dask_dataframe(dim_order=["y", "x"])
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

        # dim_order must name every dimension of the dataset
        with pytest.raises(ValueError, match=r"does not match the set of dimensions"):
            ds.to_dask_dataframe(dim_order=["x"])


@pytest.mark.parametrize("method", ["load", "compute"])
def test_dask_kwargs_variable(method):
    """Keyword arguments given to Variable.load()/compute() reach dask's compute."""
    lazy_data = da.from_array(np.arange(3), chunks=(2,))
    variable = Variable("y", lazy_data)
    # args should be passed on to dask.compute() (via DaskManager.compute())
    with mock.patch.object(da, "compute", return_value=(np.arange(3),)) as patched:
        call = getattr(variable, method)
        call(foo="bar")
    patched.assert_called_with(lazy_data, foo="bar")


@pytest.mark.parametrize("method", ["load", "compute", "persist"])
def test_dask_kwargs_dataarray(method):
    """Keyword arguments given to DataArray.load/compute/persist reach dask."""
    lazy_data = da.from_array(np.arange(3), chunks=(2,))
    array = DataArray(lazy_data)
    # load/compute are checked against dask.array.compute, persist against
    # dask.persist; the kwargs should be passed on to whichever is patched
    is_compute_like = method in ("load", "compute")
    patch_target = "dask.array.compute" if is_compute_like else "dask.persist"
    with mock.patch(patch_target) as patched:
        getattr(array, method)(foo="bar")
    patched.assert_called_with(lazy_data, foo="bar")


@pytest.mark.parametrize("method", ["load", "compute", "persist"])
def test_dask_kwargs_dataset(method):
    """Keyword arguments given to Dataset.load/compute/persist reach dask."""
    lazy_data = da.from_array(np.arange(3), chunks=(2,))
    dataset = Dataset({"x": ("y", lazy_data)})
    # load/compute are checked against dask.array.compute, persist against
    # dask.persist; the kwargs should be passed on to whichever is patched
    is_compute_like = method in ("load", "compute")
    patch_target = "dask.array.compute" if is_compute_like else "dask.persist"
    with mock.patch(patch_target) as patched:
        getattr(dataset, method)(foo="bar")
    patched.assert_called_with(lazy_data, foo="bar")


# Running count of ``kernel`` executions; tests assert on it to detect computes.
kernel_call_count = 0


def kernel(name):
    """Dask kernel used to test pickling/unpickling and __repr__.

    It lives at module level so that it can be pickled. Each invocation bumps
    ``kernel_call_count`` and returns a one-element int64 array of ones;
    ``name`` is accepted but not used.
    """
    global kernel_call_count
    kernel_call_count = kernel_call_count + 1
    return np.ones(1, dtype=np.int64)


def build_dask_array(name):
    """Return a single-chunk, length-1 int64 dask array whose task is ``kernel``.

    As a side effect ``kernel_call_count`` is reset to zero, so callers can
    afterwards check whether the array has been computed.
    """
    global kernel_call_count
    kernel_call_count = 0
    # hand-written graph: the only chunk is produced by calling kernel(name)
    graph = {(name, 0): (kernel, name)}
    return dask.array.Array(dask=graph, name=name, chunks=((1,),), dtype=np.int64)


@pytest.mark.parametrize(
    "persist", [lambda x: x.persist(), lambda x: dask.persist(x)[0]]
)
def test_persist_Dataset(persist):
    """Persisting a Dataset shrinks its graph and leaves the source untouched."""
    lazy = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk()
    lazy = lazy + 1
    graph_size_before = len(lazy.foo.data.dask)

    persisted = persist(lazy)

    assert len(persisted.foo.data.dask) == 1
    # the source dataset keeps its graph: persist doesn't mutate in place
    assert len(lazy.foo.data.dask) == graph_size_before


@pytest.mark.parametrize(
    "persist", [lambda x: x.persist(), lambda x: dask.persist(x)[0]]
)
def test_persist_DataArray(persist):
    """Persisting a DataArray leaves one graph entry per partition."""
    source = DataArray(da.arange(10, chunks=(5,)))
    lazy_sum = source + 1
    graph_size_before = len(lazy_sum.data.dask)

    persisted = persist(lazy_sum)

    # the input keeps its graph; the persisted result has one key per chunk
    assert len(lazy_sum.data.dask) == graph_size_before
    assert len(persisted.data.dask) == persisted.data.npartitions


def test_dataarray_with_dask_coords():
    """The graph of a DataArray includes its dask coords; compute loads them all."""
    import toolz

    x_var = xr.Variable("x", da.arange(8, chunks=(4,)))
    y_var = xr.Variable("y", da.arange(8, chunks=(4,)) * 2)
    values = da.random.random((8, 8), chunks=(4, 4)) + 1
    array = xr.DataArray(values, dims=["x", "y"])
    array.coords["xx"] = x_var
    array.coords["yy"] = y_var

    # the collection's graph is the union of the data and coordinate graphs
    merged_graph = toolz.merge(
        values.__dask_graph__(), x_var.__dask_graph__(), y_var.__dask_graph__()
    )
    assert dict(array.__dask_graph__()) == merged_graph

    (computed,) = dask.compute(array)
    assert not dask.is_dask_collection(computed)

    for coord in computed.coords.values():
        assert isinstance(coord._variable.data, np.ndarray)


def test_basic_compute():
    """Dataset, DataArray and Variable compute under each of several schedulers."""
    ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk({"x": 2})
    schedulers = (
        dask.threaded.get,
        dask.multiprocessing.get,
        dask.local.get_sync,
        None,
    )
    for scheduler in schedulers:
        with dask.config.set(scheduler=scheduler):
            ds.compute()
            ds.foo.compute()
            ds.foo.variable.compute()


def test_dask_layers_and_dependencies():
    """Wrapping a Dataset in ``dask.delayed`` keeps its graph dependencies."""
    ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk()

    delayed_ds = dask.delayed(ds)

    # the delayed object itself
    delayed_deps = set(delayed_ds.__dask_graph__().dependencies)
    assert delayed_deps.issuperset(ds.__dask_graph__().dependencies)

    # an attribute access on the delayed object
    attr_deps = set(delayed_ds.foo.__dask_graph__().dependencies)
    assert attr_deps.issuperset(ds.__dask_graph__().dependencies)


def make_da():
    """Build the chunked 10x20 DataArray "a" used as input by many tests.

    Besides the ``x``/``y`` index coordinates it carries a scalar coordinate
    ``c2``, a 1-D coordinate ``ndcoord`` and a chunked 2-D coordinate ``cxy``,
    plus attributes on the array and on ``x``.
    """
    chunking = {"x": 4, "y": 5}
    arr = xr.DataArray(
        np.ones((10, 20)),
        dims=["x", "y"],
        coords={"x": np.arange(10), "y": np.arange(100, 120)},
        name="a",
    ).chunk(chunking)
    arr.x.attrs["long_name"] = "x"
    arr.attrs["test"] = "test"
    arr.coords["c2"] = 0.5
    arr.coords["ndcoord"] = arr.x * 2
    arr.coords["cxy"] = (arr.x * arr.y).chunk(chunking)

    return arr


def make_ds():
    """Build the Dataset used as input by many tests.

    It starts from ``make_da()`` as variable "a" and adds further data
    variables ("b" to "e"), coordinates ("z", "c1", "cx", "xx") and attributes.
    """
    ds = xr.Dataset()
    ds["a"] = make_da()
    ds["b"] = ds.a + 50
    ds["c"] = ds.x + 20
    ds = ds.chunk({"x": 4, "y": 5})
    # everything assigned from here on is added after the chunk() call above
    ds["d"] = ("z", [1, 1, 1, 1])
    ds["z"] = [0, 1, 2, 3]
    ds["e"] = ds.x + ds.y
    ds.coords["c1"] = 0.5
    ds.coords["cx"] = ("x", np.arange(len(ds.x)))
    ds.coords["cx"].attrs["test2"] = "test2"
    ds.attrs["test"] = "test"
    ds.coords["xx"] = ds["a"] * ds.y

    ds.x.attrs["long_name"] = "x"
    ds.y.attrs["long_name"] = "y"

    return ds


# fixtures cannot be used in parametrize statements
# instead use this workaround
# https://docs.pytest.org/en/latest/deprecations.html#calling-fixtures-directly
@pytest.fixture
def map_da():
    """Fixture returning a freshly built ``make_da()`` DataArray."""
    return make_da()


@pytest.fixture
def map_ds():
    """Fixture returning a freshly built ``make_ds()`` Dataset."""
    return make_ds()


def test_unify_chunks(map_ds):
    """``unify_chunks`` reconciles inconsistent chunking without computing."""
    x_chunks = (4, 4, 2)
    y_chunks = (5, 5, 5, 5)

    # rechunk one coordinate so the dataset's chunks become inconsistent
    mismatched = map_ds.copy()
    mismatched["cxy"] = mismatched.cxy.chunk({"y": 10})

    with pytest.raises(ValueError, match=r"inconsistent chunks"):
        _ = mismatched.chunks

    expected_chunks = {"x": x_chunks, "y": y_chunks}
    with raise_if_dask_computes():
        actual_chunks = mismatched.unify_chunks().chunks
    assert actual_chunks == expected_chunks
    assert_identical(map_ds, mismatched.unify_chunks())

    # module-level function with a DataArray and a Dataset
    unified_cxy, unified_rest = xr.unify_chunks(
        mismatched.cxy, mismatched.drop_vars("cxy")
    )
    assert unified_cxy.chunks == (x_chunks, y_chunks)
    assert unified_rest.chunks == expected_chunks

    # Test unordered dims
    cxy = mismatched["cxy"]
    unified_plain, unified_transposed = xr.unify_chunks(
        cxy.chunk({"x": -1}), cxy.T.chunk({"y": -1})
    )
    assert unified_plain.chunks == (x_chunks, y_chunks)
    assert unified_transposed.chunks == (y_chunks, x_chunks)

    # Test mismatch
    with pytest.raises(ValueError, match=r"Dimension 'x' size mismatch: 10 != 2"):
        xr.unify_chunks(cxy, cxy.isel(x=slice(2)))


@pytest.mark.parametrize("obj", [make_ds(), make_da()])
@pytest.mark.parametrize(
    "transform", [lambda x: x.compute(), lambda x: x.unify_chunks()]
)
def test_unify_chunks_shallow_copy(obj, transform):
    """``unify_chunks`` returns content identical to its (transformed) input.

    ``obj`` is first passed through ``transform`` (computed, or unified once)
    and the result of a further ``unify_chunks`` call is compared against it.
    """
    obj = transform(obj)
    unified = obj.unify_chunks()
    # ``assert_identical`` reports failure by raising, so it is called as a plain
    # statement. A trailing ``and obj is not obj.unify_chunks()`` used to be
    # chained onto this call; it was never asserted (and was skipped entirely
    # whenever the call returned a falsy value), so it only looked like a check.
    # NOTE(review): if a distinct-object guarantee is wanted, add an explicit
    # ``assert obj is not unified`` after confirming that it holds for inputs
    # that contain no dask arrays.
    assert_identical(obj, unified)


@pytest.mark.parametrize("obj", [make_da()])
def test_auto_chunk_da(obj):
    """``chunk("auto")`` matches dask's own ``rechunk("auto")`` in data and chunks."""
    via_xarray = obj.chunk("auto").data
    via_dask = obj.data.rechunk("auto")
    np.testing.assert_array_equal(via_xarray, via_dask)
    assert via_xarray.chunks == via_dask.chunks


def test_map_blocks_error(map_da, map_ds):
    """Check the errors ``xr.map_blocks`` raises for invalid functions and arguments."""

    # returns a 1x1 slice of each block
    def bad_func(darray):
        return (darray * darray.x + 5 * darray.y)[:1, :1]

    with pytest.raises(ValueError, match=r"Received dimension 'x' of length 1"):
        xr.map_blocks(bad_func, map_da).compute()

    # returns a raw numpy array instead of an xarray object
    def returns_numpy(darray):
        return (darray * darray.x + 5 * darray.y).values

    with pytest.raises(TypeError, match=r"Function must return an xarray DataArray"):
        xr.map_blocks(returns_numpy, map_da)

    # args given as a bare scalar
    with pytest.raises(TypeError, match=r"args must be"):
        xr.map_blocks(operator.add, map_da, args=10)

    # kwargs given as a list
    with pytest.raises(TypeError, match=r"kwargs must be"):
        xr.map_blocks(operator.add, map_da, args=[10], kwargs=[20])

    # always raises, so no result is available to infer the output from
    def really_bad_func(darray):
        raise ValueError("couldn't do anything.")

    with pytest.raises(Exception, match=r"Cannot infer"):
        xr.map_blocks(really_bad_func, map_da)

    # a dataset whose variables are chunked inconsistently along "y"
    ds_copy = map_ds.copy()
    ds_copy["cxy"] = ds_copy.cxy.chunk({"y": 10})

    with pytest.raises(ValueError, match=r"inconsistent chunks"):
        xr.map_blocks(bad_func, ds_copy)

    # a dask-backed object passed through kwargs
    with pytest.raises(TypeError, match=r"Cannot pass dask collections"):
        xr.map_blocks(bad_func, map_da, kwargs=dict(a=map_da.chunk()))


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks(obj):
    """Mapping a function over blocks equals applying it to the whole object."""

    def func(obj):
        return obj + obj.x + 5 * obj.y

    # building the mapped object must stay lazy
    with raise_if_dask_computes():
        mapped = xr.map_blocks(func, obj)
    direct = func(obj)
    assert_chunks_equal(direct.chunk(), mapped)
    assert_identical(mapped, direct)


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_mixed_type_inputs(obj):
    """``args`` may mix non-xarray values with xarray objects."""

    def func(obj1, non_xarray_input, obj2):
        return obj1 + obj1.x + 5 * obj1.y

    extra_args = ["non_xarray_input", obj]
    with raise_if_dask_computes():
        mapped = xr.map_blocks(func, obj, args=extra_args)
    direct = func(obj, "non_xarray_input", obj)
    assert_chunks_equal(direct.chunk(), mapped)
    assert_identical(mapped, direct)


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_convert_args_to_list(obj):
    """``args`` can be given positionally as the third argument of map_blocks."""
    direct = obj + 10
    with raise_if_dask_computes():
        mapped = xr.map_blocks(operator.add, obj, [10])
    assert_chunks_equal(direct.chunk(), mapped)
    assert_identical(mapped, direct)


def test_map_blocks_dask_args():
    """Exercise ``xr.map_blocks`` when ``args`` themselves contain dask-backed arrays."""
    da1 = xr.DataArray(
        np.ones((10, 20)),
        dims=["x", "y"],
        coords={"x": np.arange(10), "y": np.arange(20)},
    ).chunk({"x": 5, "y": 4})

    # check that block shapes are the same
    def sumda(da1, da2):
        assert da1.shape == da2.shape
        return da1 + da2

    da2 = da1 + 1
    with raise_if_dask_computes():
        mapped = xr.map_blocks(sumda, da1, args=[da2])
    xr.testing.assert_equal(da1 + da2, mapped)

    # one dimension in common
    da2 = (da1 + 1).isel(x=1, drop=True)
    with raise_if_dask_computes():
        mapped = xr.map_blocks(operator.add, da1, args=[da2])
    xr.testing.assert_equal(da1 + da2, mapped)

    # test that everything works when dimension names are different
    da2 = (da1 + 1).isel(x=1, drop=True).rename({"y": "k"})
    with raise_if_dask_computes():
        mapped = xr.map_blocks(operator.add, da1, args=[da2])
    xr.testing.assert_equal(da1 + da2, mapped)

    # argument chunked differently along "x"
    with pytest.raises(ValueError, match=r"Chunk sizes along dimension 'x'"):
        xr.map_blocks(operator.add, da1, args=[da1.chunk({"x": 1})])

    # argument whose "x" index differs from that of the first object
    with pytest.raises(ValueError, match=r"cannot align.*index.*are not equal"):
        xr.map_blocks(operator.add, da1, args=[da1.reindex(x=np.arange(20))])

    # reduction
    da1 = da1.chunk({"x": -1})
    da2 = da1 + 1
    with raise_if_dask_computes():
        mapped = xr.map_blocks(lambda a, b: (a + b).sum("x"), da1, args=[da2])
    xr.testing.assert_equal((da1 + da2).sum("x"), mapped)

    # reduction with template
    da1 = da1.chunk({"x": -1})
    da2 = da1 + 1
    with raise_if_dask_computes():
        mapped = xr.map_blocks(
            lambda a, b: (a + b).sum("x"), da1, args=[da2], template=da1.sum("x")
        )
    xr.testing.assert_equal((da1 + da2).sum("x"), mapped)

    # bad template: not chunked
    with pytest.raises(ValueError, match="Provided template has no dask arrays"):
        xr.map_blocks(
            lambda a, b: (a + b).sum("x"),
            da1,
            args=[da2],
            template=da1.sum("x").compute(),
        )


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_add_attrs(obj):
    """Attrs set by the mapped function are kept unless a template is supplied."""

    def add_attrs(obj):
        obj = obj.copy(deep=True)
        obj.attrs["new"] = "new"
        obj.cxy.attrs["new2"] = "new2"
        return obj

    with_new_attrs = add_attrs(obj)
    with raise_if_dask_computes():
        mapped = xr.map_blocks(add_attrs, obj)

    assert_identical(mapped, with_new_attrs)

    # when template is specified, attrs are copied from template, not set by function
    with raise_if_dask_computes():
        mapped = xr.map_blocks(add_attrs, obj, template=obj)
    assert_identical(mapped, obj)


def test_map_blocks_change_name(map_da):
    """A name assigned inside the mapped function is carried by the result."""

    def change_name(obj):
        obj = obj.copy(deep=True)
        obj.name = "new"
        return obj

    renamed = change_name(map_da)
    with raise_if_dask_computes():
        mapped = xr.map_blocks(change_name, map_da)

    assert_identical(mapped, renamed)


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_kwargs(obj):
    """Keyword arguments are forwarded to the mapped function."""
    direct = xr.full_like(obj, fill_value=np.nan)
    forwarded = {"fill_value": np.nan}
    with raise_if_dask_computes():
        mapped = xr.map_blocks(xr.full_like, obj, kwargs=forwarded)
    assert_chunks_equal(direct.chunk(), mapped)
    assert_identical(mapped, direct)


def test_map_blocks_to_dataarray(map_ds):
    """A mapped function may turn Dataset blocks into a DataArray."""
    with raise_if_dask_computes():
        mapped = xr.map_blocks(lambda x: x.to_dataarray(), map_ds)

    direct = map_ds.to_dataarray()
    # to_dataarray does not preserve name, so cannot use assert_identical
    assert_equal(mapped, direct)


@pytest.mark.parametrize(
    "func",
    [
        lambda x: x,
        lambda x: x.to_dataset(),
        lambda x: x.drop_vars("x"),
        lambda x: x.expand_dims(k=[1, 2, 3]),
        lambda x: x.expand_dims(k=3),
        lambda x: x.assign_coords(new_coord=("y", x.y.data * 2)),
        lambda x: x.astype(np.int32),
        lambda x: x.x,
    ],
)
def test_map_blocks_da_transformations(func, map_da):
    """Each transformation gives the same DataArray result blockwise and directly."""
    with raise_if_dask_computes():
        mapped = xr.map_blocks(func, map_da)

    direct = func(map_da)
    assert_identical(mapped, direct)


@pytest.mark.parametrize(
    "func",
    [
        lambda x: x,
        lambda x: x.drop_vars("cxy"),
        lambda x: x.drop_vars("a"),
        lambda x: x.drop_vars("x"),
        lambda x: x.expand_dims(k=[1, 2, 3]),
        lambda x: x.expand_dims(k=3),
        lambda x: x.rename({"a": "new1", "b": "new2"}),
        lambda x: x.x,
    ],
)
def test_map_blocks_ds_transformations(func, map_ds):
    """Each transformation gives the same Dataset result blockwise and directly."""
    with raise_if_dask_computes():
        mapped = xr.map_blocks(func, map_ds)

    direct = func(map_ds)
    assert_identical(mapped, direct)


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_da_ds_with_template(obj):
    """With a template, a function that changes the block shape can be mapped."""
    func = lambda x: x.isel(x=[1])
    # a simple .isel(x=[1, 5, 9]) puts all those in a single chunk.
    pieces = [obj.isel(x=[i]) for i in [1, 5, 9]]
    template = xr.concat(pieces, dim="x")
    with raise_if_dask_computes():
        mapped = xr.map_blocks(func, obj, template=template)
    assert_identical(mapped, template)

    # Check that indexes are written into the graph directly
    graph = dict(mapped.__dask_graph__())
    index_entries = {k: v for k, v in graph.items() if "x-coordinate" in k}
    assert len(index_entries)
    assert all(isinstance(v, PandasIndex) for v in index_entries.values())

    # same check through the method form
    with raise_if_dask_computes():
        mapped = obj.map_blocks(func, template=template)
    assert_identical(mapped, template)


def test_map_blocks_roundtrip_string_index():
    """The dtype of a string index survives map_blocks, with or without template."""
    ds = xr.Dataset(
        {"data": (["label"], [1, 2, 3])}, coords={"label": ["foo", "bar", "baz"]}
    ).chunk(label=1)
    assert ds.label.dtype == np.dtype("=U3")

    # Dataset first, then its data variable; each with and without a template
    for source in (ds, ds.data):
        for template in (source, None):
            mapped = source.map_blocks(lambda x: x, template=template)
            assert mapped.label.dtype == ds.label.dtype


def test_map_blocks_template_convert_object():
    """With a template, the mapped function may convert between DataArray and Dataset."""
    source_da = make_da()
    source_ds = source_da.to_dataset()
    picks = [1, 5, 9]

    # DataArray in, Dataset out
    func = lambda x: x.to_dataset().isel(x=[1])
    template = xr.concat(
        [source_da.to_dataset().isel(x=[i]) for i in picks], dim="x"
    )
    with raise_if_dask_computes():
        mapped = xr.map_blocks(func, source_da, template=template)
    assert_identical(mapped, template)

    # Dataset in, DataArray out
    func = lambda x: x.to_dataarray().isel(x=[1])
    template = xr.concat(
        [source_ds.to_dataarray().isel(x=[i]) for i in picks], dim="x"
    )
    with raise_if_dask_computes():
        mapped = xr.map_blocks(func, source_ds, template=template)
    assert_identical(mapped, template)


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_errors_bad_template(obj):
    """Check the errors raised when the mapped result does not match the template."""
    # function adds a coordinate the template lacks
    with pytest.raises(ValueError, match=r"unexpected coordinate variables"):
        xr.map_blocks(lambda x: x.assign_coords(a=10), obj, template=obj).compute()
    # function drops a coordinate the template has
    with pytest.raises(ValueError, match=r"does not contain coordinate variables"):
        xr.map_blocks(lambda x: x.drop_vars("cxy"), obj, template=obj).compute()
    # function removes the "x" dimension
    with pytest.raises(ValueError, match=r"Dimensions {'x'} missing"):
        xr.map_blocks(lambda x: x.isel(x=1), obj, template=obj).compute()
    # function shortens "x" to length 1 while the template keeps the input's chunks
    with pytest.raises(ValueError, match=r"Received dimension 'x' of length 1"):
        xr.map_blocks(lambda x: x.isel(x=[1]), obj, template=obj).compute()
    # template is a tuple rather than an xarray object
    with pytest.raises(TypeError, match=r"must be a DataArray"):
        xr.map_blocks(lambda x: x.isel(x=[1]), obj, template=(obj,)).compute()
    # template built from a single selected position along "x"
    with pytest.raises(ValueError, match=r"map_blocks requires that one block"):
        xr.map_blocks(
            lambda x: x.isel(x=[1]).assign_coords(x=10), obj, template=obj.isel(x=[1])
        ).compute()
    # index values returned by the function differ from those of the template
    with pytest.raises(ValueError, match=r"Expected index 'x' to be"):
        xr.map_blocks(
            lambda a: a.isel(x=[1]).assign_coords(x=[120]),  # assign bad index values
            obj,
            template=xr.concat([obj.isel(x=[i]) for i in [1, 5, 9]], dim="x"),
        ).compute()


def test_map_blocks_errors_bad_template_2(map_ds):
    """A function adding a data variable that the template lacks is rejected."""
    with pytest.raises(ValueError, match=r"unexpected data variables {'xyz'}"):
        mapped = xr.map_blocks(lambda x: x.assign(xyz=1), map_ds, template=map_ds)
        mapped.compute()


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_object_method(obj):
    """``obj.map_blocks(f)`` gives the same result as ``xr.map_blocks(f, obj)``."""

    def func(obj):
        return obj + obj.x + 5 * obj.y

    with raise_if_dask_computes():
        via_function = xr.map_blocks(func, obj)
        via_method = obj.map_blocks(func)

    assert_identical(via_function, via_method)


def test_map_blocks_hlg_layers():
    """Identity map_blocks over variables on unrelated dimensions (see #3599)."""
    # regression test for #3599
    variables = {
        "x": (("a",), dask.array.ones(10, chunks=(5,))),
        "z": (("b",), dask.array.ones(10, chunks=(5,))),
    }
    ds = xr.Dataset(variables)
    mapped = ds.map_blocks(lambda x: x)

    xr.testing.assert_equal(mapped, ds)


def test_make_meta(map_ds):
    """``make_meta`` keeps every coord and data variable, each with zero-length dims."""
    from xarray.core.parallel import make_meta

    meta = make_meta(map_ds)

    for name in map_ds._coord_names:
        assert name in meta._coord_names
        coord = meta.coords[name]
        assert coord.shape == (0,) * coord.ndim

    for name in map_ds.data_vars:
        assert name in meta.data_vars
        data_var = meta.data_vars[name]
        assert data_var.shape == (0,) * data_var.ndim


def test_identical_coords_no_computes():
    """Adding arrays that share the same lazy coordinate object needs no compute."""
    shared_lons = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x"))
    lhs = xr.DataArray(
        da.zeros((10, 10), chunks=2), dims=("y", "x"), coords={"lons": shared_lons}
    )
    rhs = xr.DataArray(
        da.zeros((10, 10), chunks=2), dims=("y", "x"), coords={"lons": shared_lons}
    )
    with raise_if_dask_computes():
        total = lhs + rhs
    assert_identical(total, lhs)


@pytest.mark.parametrize(
    "obj", [make_da(), make_da().compute(), make_ds(), make_ds().compute()]
)
@pytest.mark.parametrize(
    "transform",
    [
        lambda x: x.reset_coords(),
        lambda x: x.reset_coords(drop=True),
        lambda x: x.isel(x=1),
        # ``assign_attrs`` returns a new object carrying the extra attribute.
        # ``x.attrs.update(...)`` returns None and mutates the shared parametrized
        # object, so that form only compared against the token of None.
        lambda x: x.assign_attrs(new_attrs=1),
        lambda x: x.assign_coords(cxy=1),
        lambda x: x.rename({"x": "xnew"}),
        lambda x: x.rename({"cxy": "cxynew"}),
    ],
)
def test_token_changes_on_transform(obj, transform):
    """Each listed transformation must change the dask token, without computing."""
    with raise_if_dask_computes():
        assert dask.base.tokenize(obj) != dask.base.tokenize(transform(obj))


@pytest.mark.parametrize(
    "obj", [make_da(), make_da().compute(), make_ds(), make_ds().compute()]
)
def test_token_changes_when_data_changes(obj):
    """The token changes after each successive change to data, coords and index.

    The steps build on each other: every new token is compared with the one
    taken just before the latest change, and tokenizing must never compute.
    """
    with raise_if_dask_computes():
        t1 = dask.base.tokenize(obj)

    # Change data_var
    if isinstance(obj, DataArray):
        obj *= 2
    else:
        obj["a"] *= 2
    with raise_if_dask_computes():
        t2 = dask.base.tokenize(obj)
    assert t2 != t1

    # Change non-index coord
    obj.coords["ndcoord"] *= 2
    with raise_if_dask_computes():
        t3 = dask.base.tokenize(obj)
    assert t3 != t2

    # Change IndexVariable
    obj = obj.assign_coords(x=obj.x * 2)
    with raise_if_dask_computes():
        t4 = dask.base.tokenize(obj)
    assert t4 != t3


@pytest.mark.parametrize("obj", [make_da().compute(), make_ds().compute()])
def test_token_changes_when_buffer_changes(obj):
    """Writing into the in-memory buffers of a computed object changes its token."""
    with raise_if_dask_computes():
        token_initial = dask.base.tokenize(obj)

    # overwrite one element of the data variable
    target = obj if isinstance(obj, DataArray) else obj["a"]
    target[0, 0] = 123
    with raise_if_dask_computes():
        token_after_data = dask.base.tokenize(obj)
    assert token_after_data != token_initial

    # overwrite one element of a non-index coordinate
    obj.coords["ndcoord"][0] = 123
    with raise_if_dask_computes():
        token_after_coord = dask.base.tokenize(obj)
    assert token_after_coord != token_after_data


@pytest.mark.parametrize(
    "transform",
    [lambda x: x, lambda x: x.copy(deep=False), lambda x: x.copy(deep=True)],
)
@pytest.mark.parametrize("obj", [make_da(), make_ds(), make_ds().variables["a"]])
def test_token_identical(obj, transform):
    """Identity and shallow/deep copies keep the token, lazy or computed."""
    with raise_if_dask_computes():
        lazy_token = dask.base.tokenize(obj)
        assert lazy_token == dask.base.tokenize(transform(obj))

    # same check on computed versions; each side is computed separately
    computed_token = dask.base.tokenize(obj.compute())
    transformed_token = dask.base.tokenize(transform(obj.compute()))
    assert computed_token == transformed_token


@pytest.mark.parametrize(
    "obj",
    [
        make_ds(),  # Dataset
        make_ds().variables["c2"],  # Variable
        make_ds().variables["x"],  # IndexVariable
    ],
)
def test_tokenize_empty_attrs(obj):
    """Issues #6970 and #8788

    The token must be the same whether ``_attrs`` is None or an empty dict,
    and a copy must share it too.
    """
    obj.attrs = {}
    assert obj._attrs is None
    token_attrs_none = dask.base.tokenize(obj)

    assert obj.attrs == {}
    assert obj._attrs == {}  # attrs getter changed None to dict
    token_attrs_dict = dask.base.tokenize(obj)
    assert token_attrs_none == token_attrs_dict

    duplicate = obj.copy()
    token_copy = dask.base.tokenize(duplicate)
    assert token_attrs_none == token_copy


def test_recursive_token():
    """Test that tokenization is invoked recursively, and doesn't just rely on the
    output of str()
    """
    # two large arrays that differ in one element but print identically
    ones = np.ones(10000)
    almost_ones = np.ones(10000)
    almost_ones[5000] = 2
    assert str(ones) == str(almost_ones)
    assert dask.base.tokenize(ones) != dask.base.tokenize(almost_ones)

    # Test DataArray and Variable
    arr_ones = DataArray(ones)
    arr_almost = DataArray(almost_ones)
    assert dask.base.tokenize(arr_ones) != dask.base.tokenize(arr_almost)

    # Test Dataset
    ds_ones = arr_ones.to_dataset(name="x")
    ds_almost = arr_almost.to_dataset(name="x")
    assert dask.base.tokenize(ds_ones) != dask.base.tokenize(ds_almost)

    # Test IndexVariable
    indexed_ones = DataArray(ones, dims=["x"], coords={"x": ones})
    indexed_almost = DataArray(ones, dims=["x"], coords={"x": almost_ones})
    assert dask.base.tokenize(indexed_ones) != dask.base.tokenize(indexed_almost)


@requires_scipy_or_netCDF4
def test_normalize_token_with_backend(map_ds):
    """A dataset read back from netCDF must not share a token with the original."""
    with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as tmp_file:
        map_ds.to_netcdf(tmp_file)
        # Open the dataset as a context manager so the file is closed even when
        # the assertion fails, instead of relying on a trailing close() call.
        with xr.open_dataset(tmp_file) as read:
            assert dask.base.tokenize(map_ds) != dask.base.tokenize(read)


@pytest.mark.parametrize(
    "compat", ["broadcast_equals", "equals", "identical", "no_conflicts"]
)
def test_lazy_array_equiv_variables(compat):
    """Check the True / None / False answers of ``lazy_array_equiv`` on Variables."""
    var1 = xr.Variable(("y", "x"), da.zeros((10, 10), chunks=2))
    var2 = xr.Variable(("y", "x"), da.zeros((10, 10), chunks=2))
    var3 = xr.Variable(("y", "x"), da.zeros((20, 10), chunks=2))

    # two identically constructed dask arrays compare equal without computing
    with raise_if_dask_computes():
        assert getattr(var1, compat)(var2, equiv=lazy_array_equiv)
    # values are actually equal, but we don't know that till we compute, return None
    with raise_if_dask_computes():
        assert getattr(var1, compat)(var2 / 2, equiv=lazy_array_equiv) is None

    # shapes are not equal, return False without computes
    with raise_if_dask_computes():
        assert getattr(var1, compat)(var3, equiv=lazy_array_equiv) is False

    # if one or both arrays are numpy, return None
    assert getattr(var1, compat)(var2.compute(), equiv=lazy_array_equiv) is None
    assert (
        getattr(var1.compute(), compat)(var2.compute(), equiv=lazy_array_equiv) is None
    )

    # default equiv, with a transpose that keeps the existing dimension order
    with raise_if_dask_computes():
        assert getattr(var1, compat)(var2.transpose("y", "x"))


@pytest.mark.parametrize(
    "compat", ["broadcast_equals", "equals", "identical", "no_conflicts"]
)
def test_lazy_array_equiv_merge(compat):
    """``xr.merge`` computes only when a lazy comparison cannot decide equality."""
    dims = ("y", "x")
    base = xr.DataArray(da.zeros((10, 10), chunks=2), dims=dims)
    twin = xr.DataArray(da.zeros((10, 10), chunks=2), dims=dims)
    taller = xr.DataArray(da.ones((20, 10), chunks=2), dims=dims)

    # identically constructed inputs: no computes allowed
    with raise_if_dask_computes():
        xr.merge([base, twin], compat=compat)
    # shapes are not equal; no computes necessary
    with raise_if_dask_computes(max_computes=0):
        with pytest.raises(ValueError):
            xr.merge([base, taller], compat=compat)
    # a derived array: up to two computes are tolerated
    with raise_if_dask_computes(max_computes=2):
        xr.merge([base, twin / 2], compat=compat)


@pytest.mark.filterwarnings("ignore::FutureWarning")  # transpose_coords
@pytest.mark.parametrize("obj", [make_da(), make_ds()])
@pytest.mark.parametrize(
    "transform",
    [
        lambda a: a.assign_attrs(new_attr="anew"),
        lambda a: a.assign_coords(cxy=a.cxy),
        lambda a: a.copy(),
        lambda a: a.isel(x=slice(None)),
        lambda a: a.loc[dict(x=slice(None))],
        lambda a: a.transpose(...),
        lambda a: a.squeeze(),  # no dimensions to squeeze
        lambda a: a.reindex(x=a.x),
        lambda a: a.reindex_like(a),
        lambda a: a.rename({"cxy": "cnew"}).rename({"cnew": "cxy"}),
        lambda a: a.pipe(lambda x: x),
        lambda a: xr.align(a, xr.zeros_like(a))[0],
        # assign
        # swap_dims
        # set_index / reset_index
    ],
)
def test_transforms_pass_lazy_array_equiv(obj, transform):
    """Each transform's output can be compared with its input without computing."""
    with raise_if_dask_computes():
        transformed = transform(obj)
        assert_equal(obj, transformed)


def test_more_transforms_pass_lazy_array_equiv(map_da, map_ds):
    """Check further no-op / round-trip operations under the compute guard.

    Every operation and every comparison below runs inside a single
    ``raise_if_dask_computes`` context using its default budget.
    """
    with raise_if_dask_computes():
        # Broadcasting the ``cxy`` variable against itself.
        self_broadcast = map_ds.cxy.broadcast_like(map_ds.cxy)
        assert_equal(self_broadcast, map_ds.cxy)
        first_broadcast = xr.broadcast(map_ds.cxy, map_ds.cxy)[0]
        assert_equal(first_broadcast, map_ds.cxy)

        # Identity map, coordinate round trip and self-assignment on the Dataset.
        identity_mapped = map_ds.map(lambda x: x)
        assert_equal(identity_mapped, map_ds)
        coords_round_trip = map_ds.set_coords("a").reset_coords("a")
        assert_equal(coords_round_trip, map_ds)
        reassigned = map_ds.assign({"a": map_ds.a})
        assert_equal(reassigned, map_ds)

        # fails because of index error
        # assert_equal(
        #     map_ds.rename_dims({"x": "xnew"}).rename_dims({"xnew": "x"}), map_ds
        # )

        # Renaming a variable away and back again.
        renamed_back = map_ds.rename_vars({"cxy": "cnew"}).rename_vars(
            {"cnew": "cxy"}
        )
        assert_equal(renamed_back, map_ds)

        # DataArray-level round trips.
        via_temp_dataset = map_da._from_temp_dataset(map_da._to_temp_dataset())
        assert_equal(via_temp_dataset, map_da)
        same_dtype = map_da.astype(map_da.dtype)
        assert_equal(same_dtype, map_da)
        transposed_cxy = map_da.transpose("y", "x", transpose_coords=False).cxy
        assert_equal(transposed_cxy, map_da.cxy)


def test_optimize():
    """Run ``dask.optimize`` on a rechunked DataArray and compute the result.

    Regression test for https://github.com/pydata/xarray/issues/3698.
    There are no explicit assertions: the test passes if nothing raises.
    """
    ones = dask.array.ones((10, 4), chunks=(5, 2))
    rechunked = xr.DataArray(ones).chunk(5)
    # dask.optimize returns a tuple; exactly one collection is expected back.
    [optimized] = dask.optimize(rechunked)
    optimized.compute()


def test_graph_manipulation():
    """dask.graph_manipulation passes an optional parameter, "rename", to the rebuilder
    function returned by __dask_postpersist__; also, the dsk passed to the rebuilder is
    a HighLevelGraph whereas with dask.persist() and dask.optimize() it's a plain dict.
    """
    import dask.graph_manipulation as gm

    variable = Variable(["x"], [1, 2]).chunk(-1).chunk(1) * 2
    data_array = DataArray(variable)
    dataset = Dataset({"d1": variable[0], "d2": variable[1], "d3": ("x", [3, 4])})

    variable2, data_array2, dataset2 = gm.clone(variable, data_array, dataset)

    originals = (variable, data_array, dataset)
    clones = (variable2, data_array2, dataset2)

    # Clones must hold the same values as the objects they were cloned from.
    for original, cloned in zip(originals, clones):
        assert_equal(cloned, original)

    # ...but their layer names and graph keys must differ, while the number
    # of layers and keys stays the same.
    for original, cloned in zip(originals, clones):
        layers_a, layers_b = original.__dask_layers__(), cloned.__dask_layers__()
        assert layers_a != layers_b
        assert len(layers_a) == len(layers_b)

        graph_a, graph_b = original.__dask_graph__(), cloned.__dask_graph__()
        assert graph_a.keys() != graph_b.keys()
        assert len(graph_a) == len(graph_b)
        assert graph_a.layers.keys() != graph_b.layers.keys()
        assert len(graph_a.layers) == len(graph_b.layers)

    # Above we performed a slice operation; adding the two slices back together creates
    # a diamond-shaped dependency graph, which in turn will trigger a collision in layer
    # names if we were to use HighLevelGraph.cull() instead of
    # HighLevelGraph.cull_layers() in Dataset.__dask_postpersist__().
    cloned_sum = dataset2.d1 + dataset2.d2
    original_sum = dataset.d1 + dataset.d2
    assert_equal(cloned_sum, original_sum)


def test_new_index_var_computes_once():
    """Building a Dataset with a dask-backed "z" coordinate may compute at most once.

    Regression test for GH1533.
    """
    lazy_values = dask.array.from_array(np.array([100, 200]))
    coords = {"z": ("z", lazy_values)}
    with raise_if_dask_computes(max_computes=1):
        Dataset(coords=coords)


def test_minimize_graph_size():
    """Check how many graph keys ``map_blocks`` creates per index variable.

    Regression test for https://github.com/pydata/xarray/issues/8409
    """
    dims = ("x", "y", "z")
    size = 120
    ds = Dataset(
        {"foo": (dims, dask.array.ones((size, size, size), chunks=(20, 20, 1)))},
        coords={dim: np.arange(size) for dim in dims},
    )

    mapped = ds.map_blocks(lambda x: x)
    graph = dict(mapped.__dask_graph__())

    chunks_per_dim = {dim: len(sizes) for dim, sizes in ds.chunksizes.items()}
    for dim in dims:
        # Count the graph keys whose first element mentions this dimension name.
        n_keys = sum(1 for key in graph if dim in key[0])
        expected = chunks_per_dim[dim]
        # Each chunk of an index variable must be included only once, not
        # once per combination of chunks along all the other dimensions.
        # e.g. previously for 'x', n_keys == numchunks['y'] * numchunks['z']
        assert n_keys == expected, (n_keys, expected)


def test_idxmin_chunking():
    """``idxmin`` over "time" must keep the "x"/"y" chunking and match the eager result.

    GH9425
    """
    nx, ny, nt = 100, 100, 10
    values = np.arange(nt * nx * ny).reshape(nt, nx, ny)
    coords = {"time": range(nt), "x": range(nx), "y": range(ny)}
    # A single chunk along "time"; chunks of 25 along "x" and "y".
    chunked = xr.DataArray(values, coords=coords).chunk({"time": -1, "x": 25, "y": 25})

    result = chunked.idxmin("time")

    expected_chunks = {dim: chunked.chunksizes[dim] for dim in ("x", "y")}
    assert result.chunksizes == expected_chunks
    assert_identical(result, chunked.compute().idxmin("time"))