File: compressed_materialization.test_slow

package info (click to toggle)
duckdb 1.5.1-3
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 299,196 kB
  • sloc: cpp: 865,414; ansic: 57,292; python: 18,871; sql: 12,663; lisp: 11,751; yacc: 7,412; lex: 1,682; sh: 747; makefile: 564
file content (238 lines) | stat: -rw-r--r-- 7,262 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# name: test/optimizer/compressed_materialization.test_slow
# description: Compressed materialization test
# group: [optimizer]

# turn on query verification; it is switched off again before the Parquet section further down
statement ok
pragma enable_verification

# EXPLAIN should only emit the optimized logical plan, which the expectations below match as "logical_opt"
statement ok
PRAGMA explain_output = OPTIMIZED_ONLY

# these functions live in the catalog, but cannot be called directly:
# a direct call has to be rejected with the binder error below
statement error
select __internal_compress_string_utinyint('L')
----
Binder Error: Compressed materialization functions are for internal use only!

# internal issue 1576
# t0 holds 500000 rows with a = b = range % 400000, so the values 0..99999 occur twice
statement ok
create table t0 as select range%400000 a, range%400000 b from range(500000);

# the first rows with b > 2 are the two copies of a = 3, expected at row numbers 7 and 8
# NOTE(review): rowsort presumably guards against the two tied rows coming back in either order - confirm
query III rowsort
select * from (
      select *, row_number() OVER () as row_number from (
          SELECT * FROM t0 ORDER BY 1) ta
      ) tb where b > 2
  order by a limit 2;
----
3	3	7
3	3	8

# tricky tests taken from test/sql/subquery/scalar/test_issue_6136.test
# we run these with one thread since they are order dependent
statement ok
create table r as select * from values (1, 1, 'a', 'A'), (1, null, 'b', 'B'), (1, 2, 'c', 'C'), (2, null, 'd', 'D') t(ra, rb, x, y);

statement ok
create table b as select * from values (1, 1, 1), (2, 1, 2), (3, 1, 3), (4, 1, null), (5, 2, 1), (6, 2, null), (7, 99, 99) t(id, ba, bb);

# single-threaded until threads is set to 4 again after the two queries below
statement ok
set threads=1

# correlated scalar subquery: per row of b, take the first (ra, rb) group of matching r rows;
# the b row (7, 99, 99) matches nothing in r and yields the trailing NULL
query T
select (
    select {'x': first(x order by x), 'y': first(y order by y), '__matches': count(*)}
    from (
        select *
        from r
        where ba = ra
          and (bb = rb or rb is null)
        order by all
    )
    group by ra, rb
    order by all
    limit 1)
from b
order by all
----
{'x': a, 'y': A, '__matches': 1}
{'x': b, 'y': B, '__matches': 1}
{'x': b, 'y': B, '__matches': 1}
{'x': b, 'y': B, '__matches': 1}
{'x': d, 'y': D, '__matches': 1}
{'x': d, 'y': D, '__matches': 1}
NULL

# same correlation, but the subquery is ordered by bb = rb and coalesce substitutes
# a struct with __matches = 0 when no row of r matches (last expected row)
# NOTE(review): the outer query has no ORDER BY, so the expected row order presumably
# relies on the single-threaded scan order of b - confirm
query T
select
  coalesce((select {'x': first(x), 'y': first(y), '__matches': count(*)} from r where ba = ra and (bb = rb or rb is null) group by ra, rb order by bb = rb limit 1), {'x': null, 'y': null, '__matches': 0}) as ref2
from b
----
{'x': a, 'y': A, '__matches': 1}
{'x': c, 'y': C, '__matches': 1}
{'x': b, 'y': B, '__matches': 1}
{'x': b, 'y': B, '__matches': 1}
{'x': d, 'y': D, '__matches': 1}
{'x': d, 'y': D, '__matches': 1}
{'x': NULL, 'y': NULL, '__matches': 0}

# the order-dependent queries are done: switch to four threads for the remaining tests
statement ok
set threads=4

# we should see compress twice (in the ORDER BY expression and payload) and decompress once (just the payload)
# NOTE(review): the <REGEX> patterns in this section wrap each call name in `.*`, so a plan with extra
# occurrences would also match - they appear to assert a lower bound rather than an exact count; confirm
statement ok
create table t1 as select range i from range(10)

query II
explain select i from t1 order by 10-i
----
logical_opt	<REGEX>:(.*__internal_decompress.*){1}(.*__internal_compress.*){2}

# 10 rows; i takes values in 0..3 and j takes values in 0..10
statement ok
create table test as
select (range + 7) % 4 i,
       (range + 7) % 11 j
from range(10)

# should see compress exactly twice (for columns i and j)
# if we see it fewer than twice we're not compressing,
# and if we see it more than twice we're likely compressing and decompressing twice (once for each ORDER BY)
# but we can compress once, then do both ORDER BYs, then decompress
query II
explain select count(i), count(j) from (select i, j from (select i, j from test order by j offset 1) order by j offset 1)
----
logical_opt	<REGEX>:(.*__internal_compress.*){2}

# should see it exactly once here, as we can only compress the group (i), not the value being summed (j)
# after the GROUP BY we do the ORDER BY, and finally decompress
query II
explain select i, sum(j) from test group by i order by i
----
logical_opt	<REGEX>:(.*__internal_compress.*){1}

# We can't deal with duplicate projections (yet) so this should see 3 compresses instead of 1
# (j is projected twice, as j1 and j2, between the two ORDER BYs)
query II
explain select count(j1), count(j2) from (select j j1, j j2 from (select j from test order by j offset 1) order by j1, j2 offset 1)
----
logical_opt	<REGEX>:(.*__internal_compress.*){3}

# DISTINCT followed by ORDER BY over the same two columns: the plan is expected to contain
# two compress calls (presumably one per column - confirm)
query II
explain select distinct i, j from test order by i, j
----
logical_opt	<REGEX>:(.*__internal_compress.*){2}

# taken from third_party/sqllogictest/test/index/orderby_nosort/10/slt_good_27.test
# the problem was that statistics propagation created an index join after "filter_prune" happened
statement ok
CREATE TABLE tab3(pk INTEGER PRIMARY KEY, col0 INTEGER, col1 FLOAT, col2 TEXT, col3 INTEGER, col4 FLOAT, col5 TEXT)

statement ok
INSERT INTO tab3 VALUES
    (0,461,479.93,'idmdh',456,464.90,'nczyk'),
    (1,473,482.60,'bguxh',460,466.25,'oseln'),
    (2,474,484.45,'bnzmd',461,467.13,'kvwna'),
    (3,475,485.1,'obtlj',462,468.73,'jkjbo'),
    (4,477,486.62,'gjtbr',463,469.9,'bhers'),
    (5,479,489.59,'bkxfm',464,470.29,'aklru'),
    (6,481,495.30,'owirt',466,471.55,'lysig'),
    (7,482,496.31,'yergm',467,473.31,'rkpxn'),
    (8,484,497.51,'fszui',468,474.44,'ztexm'),
    (9,486,498.24,'eueji',469,477.28,'amvcc')

# unique index on col3, the column produced by the IN subquery below
statement ok
CREATE UNIQUE INDEX idx_tab3_4 ON tab3 (col3)

# every row has col1 > 93.79, so the subquery returns all col3 values;
# 461 (pk 0) is the only col0 value that also occurs in col3, hence the single result row
query I
SELECT pk FROM tab3 WHERE col0 IN (SELECT col3 FROM tab3 WHERE (col1 > 93.79)) ORDER BY 1 DESC
----
0

# test that we compress all-NULL columns (from multiple Parquet files) to utinyint too (if union_by_name is true)
require parquet

# verification is switched off from here on
# NOTE(review): the reason is not stated in this file - confirm before re-enabling
statement ok
pragma disable_verification

# one column without NULL, and two columns (varchar and bigint) that are all NULL
# cm1 is written from range(100) and cm2 from range(100,200); both match the cm*.parquet glob used below
statement ok
copy (select hash(range + 1) i, null::varchar j, null::bigint k from range(100)) to '__TEST_DIR__/cm1.parquet'

statement ok
copy (select hash(range + 1) i, null::varchar j, null::bigint k from range(100,200)) to '__TEST_DIR__/cm2.parquet'

# has NULL, and does not have non-NULL
# i.e. stats() for both j and k must contain "[Has Null: true, Has No Null: false]" when reading both files
query II
select
    stats(j) LIKE '%[Has Null: true, Has No Null: false]%',
    stats(k) LIKE '%[Has Null: true, Has No Null: false]%'
from read_parquet('__TEST_DIR__/cm*.parquet', union_by_name=true) limit 1
----
true	true

# this should lead to a plan where both all-NULL columns (varchar j and bigint k) are compressed
# the check is made against the physical plan, matched as "physical_plan" below
statement ok
PRAGMA explain_output = PHYSICAL_ONLY

# the plan text must contain two decompress calls followed by two compress calls
query II
explain select * from read_parquet('__TEST_DIR__/cm*.parquet', union_by_name=true) order by i
----
physical_plan	<REGEX>:.*__internal_decompress.*__internal_decompress.*__internal_compress.*__internal_compress.*

# and of course some tpch stuff

require tpch

# populate the TPC-H data at sf=0.01 (lineitem is queried below)
statement ok
call dbgen(sf=0.01)

# explain_output was already set to PHYSICAL_ONLY in the Parquet section; it is set again here
# NOTE(review): presumably restated so this section does not depend on the previous one - confirm
statement ok
PRAGMA explain_output = PHYSICAL_ONLY

# tpch q1 should use perfect hash aggregate
query II
EXPLAIN
SELECT
    l_returnflag,
    l_linestatus,
    sum(l_quantity) AS sum_qty,
    sum(l_extendedprice) AS sum_base_price,
    sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    avg(l_quantity) AS avg_qty,
    avg(l_extendedprice) AS avg_price,
    avg(l_discount) AS avg_disc,
    count(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= CAST('1998-09-02' AS date)
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
----
physical_plan	<REGEX>:.*PERFECT_HASH_GROUP_BY.*

# back to the optimized logical plan for the compress/decompress check below
statement ok
PRAGMA explain_output = OPTIMIZED_ONLY

# test that we're compressing lineitem
# (the plan text must contain a decompress call followed by a compress call)
query II
explain select * from lineitem order by l_shipdate
----
logical_opt	<REGEX>:.*__internal_decompress.*__internal_compress.*

# test that we get the same result with and without compressed materialization
# both runs carry the label q0 and leave the expected values empty
# NOTE(review): presumably the runner compares the two labelled results against each other - confirm
query IIIIIIIIIIIIIII nosort q0
select * from lineitem order by l_shipdate
----

# second run of the same query, now with the compressed_materialization optimizer disabled
statement ok
set disabled_optimizers to 'compressed_materialization'

query IIIIIIIIIIIIIII nosort q0
select * from lineitem order by l_shipdate
----