# name: test/sql/parallelism/intraquery/depth_first_evaluation.test_slow
# description: Test that query plans are evaluated in a depth-first fashion
# group: [intraquery]

mode skip

# we need a persistent DB because we want to compress the table that we're working with
load __TEST_DIR__/depth_first_evaluation.db

# we don't want any disk spilling because we're testing memory pressure
statement ok
SET temp_directory = ''

# 2GiB is pretty tight: each single aggregation should take only slightly less
# in this test, we check that we don't have multiple aggregations active simultaneously,
# so this limit should be tight enough to catch that
statement ok
SET memory_limit = '2GiB'

statement ok
SET threads = 4

# 10M integers but the table is tiny because of delta compression
statement ok
CREATE TABLE integers AS SELECT range i FROM range(10_000_000)
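
# as a quick sanity check (not part of this test), the compression chosen per
# segment could be inspected with something like:
#   SELECT DISTINCT compression FROM pragma_storage_info('integers');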

# one of these should easily fit in memory
query I
SELECT count(*) c FROM (SELECT DISTINCT i FROM integers)
----
10000000

# the next query performs 10 identical distinct aggregations and unions them together
# each distinct aggregation has a different LIMIT (which has no effect, as it exceeds the table size)
# so that this test is future-proof (in case DuckDB ever does common sub-plan elimination)

# the idea here is that if DuckDB did breadth-first plan evaluation (like it used to),
# it would first perform the 'Sink' for every distinct aggregation one by one
# this would create huge temporary intermediates
# only after that would DuckDB perform the 'Finalize' for every distinct aggregation one by one
# the 'Finalize' reduces the data size to a single row
# so, this used to throw an OOM exception given the current memory limit

# with depth-first plan evaluation, DuckDB performs the 'Finalize' of each distinct aggregation
# before starting the 'Sink' of the next one
# now this query completes without much memory pressure!
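
# in rough numbers (an estimate, not measured here): if one sunk-but-not-yet-finalized
# distinct aggregation takes only slightly less than the 2GiB limit, then ten of them
# live at once need roughly 10x that, while depth-first keeps at most one active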
query I
SELECT sum(c)
FROM (
    SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_000)
    UNION ALL
    SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_001)
    UNION ALL
    SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_002)
    UNION ALL
    SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_003)
    UNION ALL
    SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_004)
    UNION ALL
    SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_005)
    UNION ALL
    SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_006)
    UNION ALL
    SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_007)
    UNION ALL
    SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_008)
    UNION ALL
    SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_009)
)
----
100000000
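
# expected result, by simple arithmetic: the LIMITs all exceed the table's 10M rows,
# so each branch counts all 10M distinct values of i, and sum(c) = 10 * 10_000_000 = 100_000_000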

statement ok
DROP TABLE integers

# column i has 0, 100_000, 200_000, etc.: around 100 integer values spread out across the range 0 to 10 million
# all other values in column i are equal to range + 0.5
# columns j and k0 through k9 are just ranges from 0 to 10 million
# we have to do this so our statistics propagation and dynamic join filters don't trivialise the query
statement ok
CREATE TABLE doubles AS
SELECT
    CASE WHEN range % 100_000 = 0 THEN range ELSE range + 0.5 END i,
    range::DOUBLE j,
    range::DOUBLE k0,
    range::DOUBLE k1,
    range::DOUBLE k2,
    range::DOUBLE k3,
    range::DOUBLE k4,
    range::DOUBLE k5,
    range::DOUBLE k6,
    range::DOUBLE k7,
    range::DOUBLE k8,
    range::DOUBLE k9
FROM range(10_000_000)
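
# for example (from the CASE expression above): row 100_000 gets i = 100_000, a value
# that also occurs in j, while row 100_001 gets i = 100_001.5, which can never equal any j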

# one of these should always fit in memory
# the idea is that the CTE is a large join (10M x 10M)
# but it's really selective: only 100 tuples come out of it

# then, we join with doubles UNION ALL'ed with itself, so that it becomes the probe pipeline,
# i.e., it has a higher cardinality than the selective join, which becomes the build side
query I
WITH c AS NOT MATERIALIZED (
    SELECT d0.k0
    FROM doubles d0
    JOIN doubles d1
    ON (d0.i = d1.j)
)
SELECT count(*)
FROM (
    SELECT * FROM doubles
    UNION ALL
    SELECT * FROM doubles
) d
JOIN c
ON (d.k0 = c.k0)
----
200
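
# expected result: c keeps the 100 rows where d0.i is integral, so c.k0 holds 100
# distinct values; each of them occurs twice in the 20M-row union, hence 100 * 2 = 200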

# now we just crank up the number of CTEs that we're joining with to 10

# again, if DuckDB did breadth-first plan evaluation (like it used to),
# it would 'Sink' into all of the builds in the CTEs one by one, creating huge intermediates
# only after that would it perform all the selective joins and reduce the size of the intermediates
# so, this used to throw an OOM exception

# with depth-first plan evaluation, DuckDB performs the selective joins one by one,
# reducing the size of intermediates immediately, and the query completes!
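
# rough arithmetic (an estimate): each CTE join builds a hash table over 10M rows
# (an 8-byte DOUBLE key plus payload and hash-table overhead), i.e. hundreds of MB
# per build; ten concurrent builds would need several GiB, well above the 2GiB limit,
# while depth-first reduces each join to ~100 rows before the next build starts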
query I
WITH c0 AS NOT MATERIALIZED (
    SELECT d0.k0
    FROM doubles d0
    JOIN doubles d1
    ON (d0.i = d1.j)
    LIMIT 100_000_000
), c1 AS NOT MATERIALIZED (
    SELECT d0.k1
    FROM doubles d0
    JOIN doubles d1
    ON (d0.i = d1.j)
    LIMIT 100_000_001
), c2 AS NOT MATERIALIZED (
    SELECT d0.k2
    FROM doubles d0
    JOIN doubles d1
    ON (d0.i = d1.j)
    LIMIT 100_000_002
), c3 AS NOT MATERIALIZED (
    SELECT d0.k3
    FROM doubles d0
    JOIN doubles d1
    ON (d0.i = d1.j)
    LIMIT 100_000_003
), c4 AS NOT MATERIALIZED (
    SELECT d0.k4
    FROM doubles d0
    JOIN doubles d1
    ON (d0.i = d1.j)
    LIMIT 100_000_004
), c5 AS NOT MATERIALIZED (
    SELECT d0.k5
    FROM doubles d0
    JOIN doubles d1
    ON (d0.i = d1.j)
    LIMIT 100_000_005
), c6 AS NOT MATERIALIZED (
    SELECT d0.k6
    FROM doubles d0
    JOIN doubles d1
    ON (d0.i = d1.j)
    LIMIT 100_000_006
), c7 AS NOT MATERIALIZED (
    SELECT d0.k7
    FROM doubles d0
    JOIN doubles d1
    ON (d0.i = d1.j)
    LIMIT 100_000_007
), c8 AS NOT MATERIALIZED (
    SELECT d0.k8
    FROM doubles d0
    JOIN doubles d1
    ON (d0.i = d1.j)
    LIMIT 100_000_008
), c9 AS NOT MATERIALIZED (
    SELECT d0.k9
    FROM doubles d0
    JOIN doubles d1
    ON (d0.i = d1.j)
    LIMIT 100_000_009
)
SELECT count(*)
FROM (
    SELECT * FROM doubles
    UNION ALL
    SELECT * FROM doubles
) d
JOIN c0
ON (d.k0 = c0.k0)
JOIN c1
ON (d.k1 = c1.k1)
JOIN c2
ON (d.k2 = c2.k2)
JOIN c3
ON (d.k3 = c3.k3)
JOIN c4
ON (d.k4 = c4.k4)
JOIN c5
ON (d.k5 = c5.k5)
JOIN c6
ON (d.k6 = c6.k6)
JOIN c7
ON (d.k7 = c7.k7)
JOIN c8
ON (d.k8 = c8.k8)
JOIN c9
ON (d.k9 = c9.k9)
----
200
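
# expected result: the k columns are identical ranges, so each of the ten CTEs selects
# the same 100 underlying rows and every join matches the same d rows as the single-CTE
# query above; the count stays at 100 * 2 = 200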