# NOTE: removed HTML-extraction residue (Debian package-browser header and a
# line-number gutter) that was not part of the original test file.
# name: test/sql/copy/row_groups_per_file.test
# description: test ROW_GROUPS_PER_FILE parameter for parquet COPY
# group: [copy]

require parquet

# the chunk-count comments below assume a fixed vector (chunk) size of 2048 rows
require vector_size 2048

# 10000 rows, two identical integer columns
statement ok
CREATE TABLE bigdata AS SELECT i AS col_a, i AS col_b FROM range(0,10000) tbl(i);

# single-threaded first, so the number of files produced is deterministic
# (the multi-threaded section further down only checks a best-effort range)
statement ok
set threads=1

# creates a new file after every chunk, 5 chunks in total, so 5 parquet files
statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file1' (FORMAT PARQUET, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 1)

# no rows may be lost or duplicated by the file rotation
query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file1/*.parquet'
----
10000

# exactly 5 files were written
query I
SELECT count(*) FROM glob('__TEST_DIR__/row_groups_per_file1/*.parquet')
----
5

# new file after every other chunk, 3 in total
statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file2' (FORMAT PARQUET, ROW_GROUP_SIZE 4000, ROW_GROUPS_PER_FILE 1)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file2/*.parquet'
----
10000

query I
SELECT count(*) FROM glob('__TEST_DIR__/row_groups_per_file2/*.parquet')
----
3

# new row group after every chunk, new file after every 2 row groups, again 3 in total
statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file3' (FORMAT PARQUET, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 2)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file3/*.parquet'
----
10000

query I
SELECT count(*) FROM glob('__TEST_DIR__/row_groups_per_file3/*.parquet')
----
3

# file rotation with return stats: RETURN_STATS yields one row per file written
# (filename, counts, per-column stats map, NULL partition info); filenames are
# uuids here, so match them with regexes
query IIIIII
COPY bigdata TO '__TEST_DIR__/row_groups_per_file_stats/' (FORMAT PARQUET, WRITE_EMPTY_FILE false, FILENAME_PATTERN '{uuid}', ROW_GROUP_SIZE 3000, ROW_GROUPS_PER_FILE 2, RETURN_STATS);
----
<REGEX>:.*row_groups_per_file_stats.*[a-zA-Z0-9-]{36}.parquet	<REGEX>:\d+	<REGEX>:\d+	<REGEX>:\d+	<REGEX>:{'"col_a"'={column_size_bytes=\d+, max=\d+, min=\d+, null_count=0, num_values=\d+}, '"col_b"'={column_size_bytes=\d+, max=\d+, min=\d+, null_count=0, num_values=\d+}}	NULL
<REGEX>:.*row_groups_per_file_stats.*[a-zA-Z0-9-]{36}.parquet	<REGEX>:\d+	<REGEX>:\d+	<REGEX>:\d+	<REGEX>:{'"col_a"'={column_size_bytes=\d+, max=\d+, min=\d+, null_count=0, num_values=\d+}, '"col_b"'={column_size_bytes=\d+, max=\d+, min=\d+, null_count=0, num_values=\d+}}	NULL

# now we crank up the threads
statement ok
PRAGMA verify_parallelism

statement ok
set threads=4

# with multiple threads writing to the same file this is best-effort:
# the exact number of files is nondeterministic, so only a sane range is asserted
statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file4' (FORMAT PARQUET, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 1)

# regardless of how files were rotated, no rows may be lost or duplicated
query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file4/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file4/*.parquet')
----
true

statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file5' (FORMAT PARQUET, ROW_GROUP_SIZE 4000, ROW_GROUPS_PER_FILE 1)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file5/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file5/*.parquet')
----
true

statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file6' (FORMAT PARQUET, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 2)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file6/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file6/*.parquet')
----
true

# should work nicely with PER_THREAD_OUTPUT: each thread rotates its own files,
# so again only assert a best-effort range for the file count
statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file7' (FORMAT PARQUET, PER_THREAD_OUTPUT TRUE, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 1)

# all rows still present
query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file7/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file7/*.parquet')
----
true

statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file8' (FORMAT PARQUET, PER_THREAD_OUTPUT TRUE, ROW_GROUP_SIZE 4000, ROW_GROUPS_PER_FILE 1)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file8/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file8/*.parquet')
----
true

statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file9' (FORMAT PARQUET, PER_THREAD_OUTPUT TRUE, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 2)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file9/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file9/*.parquet')
----
true

# doesn't work in combination with certain params: both combinations below are
# expected to be rejected with a "Not implemented" error (exact message text
# must match the runner's expected-error prefix)
statement error
COPY bigdata TO '__TEST_DIR__/row_groups_per_file_error' (FORMAT PARQUET, ROW_GROUPS_PER_FILE 1, USE_TMP_FILE TRUE);
----
Not implemented Error

statement error
COPY bigdata TO '__TEST_DIR__/row_groups_per_file_error' (FORMAT PARQUET, ROW_GROUPS_PER_FILE 1, PARTITION_BY col_a);
----
Not implemented Error