File: format_uuid.test

package info (click to toggle)
duckdb 1.5.1-3
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 299,196 kB
  • sloc: cpp: 865,414; ansic: 57,292; python: 18,871; sql: 12,663; lisp: 11,751; yacc: 7,412; lex: 1,682; sh: 747; makefile: 564
file content (235 lines) | stat: -rw-r--r-- 6,686 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# name: test/sql/copy/format_uuid.test
# description: basic tests for the hive partitioned write
# group: [copy]

require parquet

require vector_size 512

statement ok
PRAGMA verify_parallelism;

statement ok
PRAGMA threads=4;

statement ok
CREATE TABLE test2 as SELECT i as a, (i*2) as b, power(i,2) as c from range(0,10) tbl(i);

statement ok
CREATE TABLE test3 as SELECT i as a, (i*3) as b, power(i,3) as c from range(0,10) tbl(i);

statement ok
CREATE TABLE test4 as SELECT i as a, (i*4) as b, power(i,4) as c from range(0,10) tbl(i);

statement ok
CREATE TABLE test5 as SELECT i as a, (i*5) as b, power(i,5) as c from range(0,10) tbl(i);

statement ok
CREATE TABLE testpto as SELECT i as a, (i*10) as b, (i*100) as c from range(0,10000) tbl(i);

query III
SELECT * FROM test2;
----
0	0	0
1	2	1
2	4	4
3	6	9
4	8	16
5	10	25
6	12	36
7	14	49
8	16	64
9	18	81    

# Test with the FILENAME_PATTERN option. "{uuid}" will be replaced with a unique string.
statement ok
COPY test2 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), FILENAME_PATTERN "leading_{uuid}");

# the filename contains a uuid
query III
SELECT * FROM '__TEST_DIR__/part/a=9/leading_????????-????-4???-????-????????????.parquet';
----
18	81	9

statement ok
COPY test2 TO '__TEST_DIR__/trailing' (FORMAT PARQUET, PARTITION_BY (a), FILENAME_PATTERN "{uuid}_trailing");

# the filename contains a uuid
query III
SELECT * FROM '__TEST_DIR__/trailing/a=9/????????-????-4???-????-????????????_trailing.parquet';
----
18	81	9

statement ok
COPY test2 TO '__TEST_DIR__/only_offset' (FORMAT PARQUET, PARTITION_BY (a), FILENAME_PATTERN "{i}");

query III
SELECT * FROM '__TEST_DIR__/only_offset/a=9/?.parquet';
----
18	81	9

# create a second file in the same partition dirs. This is the append workflow
statement ok
COPY test3 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), overwrite_or_ignore TRUE, FILENAME_PATTERN "leading_{uuid}_trailing");

# the filename contains a uuid
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/leading_????????-????-4???-????-????????????_trailing.parquet';
----
27	729	9

# the partition dir contains the files from previous 2 queries
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/leading_????????-????-4???-????-????????????*.parquet';
----
18	81	9
27	729	9

# Test without a specified format name for the outputfile.
statement ok
COPY test4 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), overwrite_or_ignore TRUE);

# the filename is the default: "data_{i}"
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/data_[0-9]*.parquet';
----
36	6561	9

# the partition dir contains the files from test[2-4]
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/*.parquet';
----
18	81	9
27	729	9
36	6561	9

# Test where the FILENAME_PATTERN does not contain "{i}" or "{uuid}". 
statement ok
COPY test5 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), overwrite_or_ignore TRUE, FILENAME_PATTERN "basename");

# the file has the name of the given input
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/basename[0-9]*.parquet';
----
45	59049	9

# the partition dir contains the files from test[2-5]
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/*.parquet';
----
18	81	9
27	729	9
36	6561	9
45	59049	9

# Test without the overwrite_or_ignore param, that tries to add a file to an existing directory
statement error
COPY test5 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a));
----
Directory

# Test where the FILENAME_PATTERN param has no value
statement error
COPY test4 TO '__TEST_DIR__/to_be_overwritten' (FORMAT PARQUET, PARTITION_BY (a), FILENAME_PATTERN);
----
Copy option "FILENAME_PATTERN" requires an argument of type VARCHAR

statement ok
COPY test4 TO '__TEST_DIR__/to_be_overwritten' (FORMAT PARQUET, PARTITION_BY (a), FILENAME_PATTERN a_file_name);

query I
SELECT COUNT(*) FROM GLOB('__TEST_DIR__/to_be_overwritten/a=9/a_file_name*');
----
1

query III sort
SELECT * FROM '__TEST_DIR__/to_be_overwritten/a=9/a_file_name*.parquet';
----
36	6561	9

# Test with a combination of {i} and {uuid}
statement ok
COPY test5 TO '__TEST_DIR__/incorrect_pos' (FORMAT PARQUET, PARTITION_BY (a), filename_pattern "a_name{i}_with_{uuid}_numbers");

query III sort
SELECT * FROM '__TEST_DIR__/incorrect_pos/a=5/a_name?_with_????????-????-4???-????-????????????_numbers.parquet';
----
25	3125	5

# Test UUID v7
statement ok
COPY test5 TO '__TEST_DIR__/uuid_v7' (FORMAT PARQUET, PARTITION_BY (a), filename_pattern "a_name_with_{uuidv7}");

query III sort
SELECT * FROM '__TEST_DIR__/uuid_v7/a=5/a_name_with_????????-????-7???-????-????????????.parquet';
----
25	3125	5

# Test UUID v7 mixed with UUID v4
statement ok
COPY test5 TO '__TEST_DIR__/mixed_uuid_types' (FORMAT PARQUET, PARTITION_BY (a), filename_pattern "a_name{uuid}_with_{uuidv7}_numbers");

query III sort
SELECT * FROM '__TEST_DIR__/mixed_uuid_types/a=5/a_name????????-????-4???-????-????????????_with_????????-????-7???-????-????????????_numbers.parquet';
----
25	3125	5


# Test "{uuid}" with per_thread_output TRUE
statement ok
COPY testpto TO '__TEST_DIR__/pto' (FORMAT PARQUET, PER_THREAD_OUTPUT TRUE, OVERWRITE_OR_IGNORE TRUE, FILENAME_PATTERN "basename_{uuid}");

query I sort
SELECT COUNT(*) FROM '__TEST_DIR__/pto/basename_????????-????-4???-????-????????????.parquet';
----
10000

query I
SELECT COUNT(*) >= 1 FROM GLOB('__TEST_DIR__/pto/basename_????????-????-4???-????-????????????.parquet');
----
true

statement ok
PRAGMA threads=1;

# Test where overwrite_or_ignore param has no value -> will be true -> overwrite previous test
statement ok
COPY test5 TO '__TEST_DIR__/to_be_overwritten2' (FORMAT PARQUET, PARTITION_BY (a), OVERWRITE_OR_IGNORE);

query I
SELECT COUNT(*) FROM GLOB('__TEST_DIR__/to_be_overwritten2/a=9/*');
----
1

query III sort
SELECT * FROM '__TEST_DIR__/to_be_overwritten2/a=9/data_0*.parquet';
----
45	59049	9

# Test where overwrite_or_ignore param == true, and filename_pattern is set to an existing file
statement ok
COPY test5 TO '__TEST_DIR__/directory0' (FORMAT PARQUET, PARTITION_BY (a), overwrite_or_ignore TRUE, FILENAME_PATTERN "my_filename");

query I
SELECT COUNT(*) FROM GLOB('__TEST_DIR__/directory0/a=7/*');
----
1

query III sort
SELECT * FROM '__TEST_DIR__/directory0/a=7/my_filename0.parquet';
----
35	16807	7

# When partition columns are written to files, `select *` returns in order of columns
statement ok
COPY test5 TO '__TEST_DIR__/to_be_overwritten2' (FORMAT PARQUET, PARTITION_BY (a), OVERWRITE_OR_IGNORE, WRITE_PARTITION_COLUMNS);

query I
SELECT COUNT(*) FROM GLOB('__TEST_DIR__/to_be_overwritten2/a=9/*');
----
1

query III sort
SELECT * FROM '__TEST_DIR__/to_be_overwritten2/a=9/data_0*.parquet';
----
9	45	59049