# name: test/sql/copy/file_size_bytes.test
# description: test FILE_SIZE_BYTES parameter for COPY
# group: [copy]
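
# FILE_SIZE_BYTES makes COPY rotate to a new output file once the current file reaches
# (roughly) the given size; the value is either a plain byte count or a human-readable
# size string. A minimal syntax sketch, using a hypothetical table and target directory:
#   COPY my_table TO 'my_output_dir' (FORMAT CSV, FILE_SIZE_BYTES '1mb');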

# different vector sizes result in a different number of files
require no_vector_verification

statement ok
CREATE TABLE bigdata AS SELECT i AS col_a, i AS col_b FROM range(0,10000) tbl(i);

statement ok
set threads=1

# parameter in bytes
statement ok
COPY (FROM bigdata) TO '__TEST_DIR__/file_size_bytes_csv1' (FORMAT CSV, FILE_SIZE_BYTES 1000);

query I
SELECT COUNT(*) FROM read_csv_auto('__TEST_DIR__/file_size_bytes_csv1/*.csv')
----
10000

# should lead to 3 files
query I
SELECT count(*) FROM glob('__TEST_DIR__/file_size_bytes_csv1/*.csv')
----
3
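
# not asserted here, but per-file row counts could be inspected with a query along these lines
# (the filename parameter, assumed here to add the source file as a column, is not part of this test):
#   SELECT filename, count(*) FROM read_csv_auto('__TEST_DIR__/file_size_bytes_csv1/*.csv', filename=true) GROUP BY filename;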

# parameter in human-readable format
statement ok
COPY (FROM bigdata) TO '__TEST_DIR__/file_size_bytes_csv2' (FORMAT CSV, FILE_SIZE_BYTES '1kb');

query I
SELECT COUNT(*) FROM read_csv_auto('__TEST_DIR__/file_size_bytes_csv2/*.csv')
----
10000

# same, should also lead to 3 files
query I
SELECT count(*) FROM glob('__TEST_DIR__/file_size_bytes_csv2/*.csv')
----
3

# we only do a simple test for JSON/Parquet, because FILE_SIZE_BYTES is handled outside of the format implementations
require json

statement ok
COPY (FROM bigdata) TO '__TEST_DIR__/file_size_bytes_json' (FORMAT JSON, FILE_SIZE_BYTES '1kb');

query I
SELECT COUNT(*) FROM read_json_auto('__TEST_DIR__/file_size_bytes_json/*.json')
----
10000

# JSON is a bit more wasteful: typically around 5 files, so allow a generous range
query I
SELECT count(*) >= 3 and count(*) <= 12 FROM glob('__TEST_DIR__/file_size_bytes_json/*.json')
----
true

require parquet

# we can trigger early flushes with Parquet by setting a low row group size
# also, we have to use hash(col)::DOUBLE because sequential integers compress extremely well, which would throw off FILE_SIZE_BYTES
statement ok
COPY (SELECT hash(col_a)::DOUBLE, hash(col_b)::DOUBLE FROM bigdata) TO '__TEST_DIR__/file_size_bytes_parquet' (FORMAT PARQUET, ROW_GROUP_SIZE 2000, FILE_SIZE_BYTES '1kb');

query I
SELECT COUNT(*) FROM read_parquet('__TEST_DIR__/file_size_bytes_parquet/*.parquet')
----
10000

# we get one file per row group flush (10000 rows, row group size 2000), for a total of 5
query I
SELECT count(*) FROM glob('__TEST_DIR__/file_size_bytes_parquet/*.parquet')
----
5

# 10x the input by inserting 9 extra copies (cross join with range(9)), for 100k rows in total
statement ok
INSERT INTO bigdata SELECT bigdata.* FROM bigdata, range(9)

# now we crank up the threads
statement ok
PRAGMA verify_parallelism;

statement ok
pragma threads=4;
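
# rough size arithmetic for the tests below (assuming ~9-10 bytes per CSV line):
#   100k rows * ~9.5 bytes per line is roughly 950kb of output in total
#   with 4 threads and balanced work, each thread produces roughly 240kb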

# with parallelism we make a best-effort attempt to hit the specified file size, but we are more likely to overshoot
# the output would be ~950kb if written to a single file; what happens if we set a 500kb limit?
statement ok
COPY (FROM bigdata) TO '__TEST_DIR__/file_size_bytes_csv3' (FORMAT CSV, FILE_SIZE_BYTES '500kb');

query I
SELECT COUNT(*) FROM read_csv_auto('__TEST_DIR__/file_size_bytes_csv3/*.csv')
----
100000

# there should always be exactly 2 files, even with nondeterministic parallelism
query I
SELECT count(*) FROM glob('__TEST_DIR__/file_size_bytes_csv3/*.csv')
----
2

# what about a 200kb limit, which is less than a quarter of the total output size
statement ok
COPY (FROM bigdata) TO '__TEST_DIR__/file_size_bytes_csv4' (FORMAT CSV, FILE_SIZE_BYTES '200kb');

query I
SELECT COUNT(*) FROM read_csv_auto('__TEST_DIR__/file_size_bytes_csv4/*.csv')
----
100000

# there should be around 4 files; the exact count varies due to parallelism
query I
SELECT count(*) BETWEEN 3 AND 5 FROM glob('__TEST_DIR__/file_size_bytes_csv4/*.csv')
----
1

# should work nicely with per thread output
# no thread should see more than 700kb (being lenient in case of thread imbalance here),
# so this should yield one file per thread
statement ok
COPY (FROM bigdata) TO '__TEST_DIR__/file_size_bytes_csv5' (FORMAT CSV, FILE_SIZE_BYTES '700kb', PER_THREAD_OUTPUT TRUE);

query I
SELECT COUNT(*) FROM read_csv_auto('__TEST_DIR__/file_size_bytes_csv5/*.csv')
----
100000

# should yield multiple files, though possibly fewer than 4 because files are created lazily
query I
SELECT count(*) > 1 FROM glob('__TEST_DIR__/file_size_bytes_csv5/*.csv')
----
true

# each thread sees ~240kb if the work is balanced; what about a 190kb limit?
statement ok
COPY (FROM bigdata) TO '__TEST_DIR__/file_size_bytes_csv6' (FORMAT CSV, FILE_SIZE_BYTES '190kb', PER_THREAD_OUTPUT TRUE);

query I
SELECT COUNT(*) FROM read_csv_auto('__TEST_DIR__/file_size_bytes_csv6/*.csv')
----
100000

# ~2 files per thread, around 8 in total (as few as 5 files in the extreme case where thread imbalance leaves all work to a single thread)
query I
SELECT count(*) BETWEEN 5 AND 10 FROM glob('__TEST_DIR__/file_size_bytes_csv6/*.csv')
----
1

# FILE_SIZE_BYTES does not work in combination with certain other parameters
statement error
COPY (FROM bigdata) TO '__TEST_DIR__/file_size_bytes_csv7' (FORMAT CSV, FILE_SIZE_BYTES '190kb', USE_TMP_FILE TRUE);
----
Not implemented Error

statement error
COPY (FROM bigdata) TO '__TEST_DIR__/file_size_bytes_csv7' (FORMAT CSV, FILE_SIZE_BYTES '190kb', PARTITION_BY col_a);
----
Not implemented Error