# name: test/sql/copy/row_groups_per_file.test
# description: test ROW_GROUPS_PER_FILE parameter for parquet COPY
# group: [copy]

require parquet

require vector_size 2048

statement ok
CREATE TABLE bigdata AS SELECT i AS col_a, i AS col_b FROM range(0,10000) tbl(i);

statement ok
set threads=1

# creates a new file after every chunk: each 2048-row chunk already hits ROW_GROUP_SIZE 2000,
# and 10000 rows make 5 chunks, so 5 parquet files (row group counts per file are verified below)
statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file1' (FORMAT PARQUET, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 1)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file1/*.parquet'
----
10000

query I
SELECT count(*) FROM glob('__TEST_DIR__/row_groups_per_file1/*.parquet')
----
5
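
# sanity check (assuming parquet_metadata() accepts the same glob pattern as read_parquet):
# with ROW_GROUPS_PER_FILE 1 and a single thread, every file should hold exactly one row group
query I
SELECT max(cnt) FROM (
    SELECT file_name, count(DISTINCT row_group_id) AS cnt
    FROM parquet_metadata('__TEST_DIR__/row_groups_per_file1/*.parquet')
    GROUP BY file_name
)
----
1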

# new file after every other chunk: two 2048-row chunks hit ROW_GROUP_SIZE 4000, so ceil(5 / 2) = 3 files in total
statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file2' (FORMAT PARQUET, ROW_GROUP_SIZE 4000, ROW_GROUPS_PER_FILE 1)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file2/*.parquet'
----
10000

query I
SELECT count(*) FROM glob('__TEST_DIR__/row_groups_per_file2/*.parquet')
----
3
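
# same sanity check as above for the larger row groups: still one row group per file
query I
SELECT max(cnt) FROM (
    SELECT file_name, count(DISTINCT row_group_id) AS cnt
    FROM parquet_metadata('__TEST_DIR__/row_groups_per_file2/*.parquet')
    GROUP BY file_name
)
----
1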

# new row group after every chunk, new file after every 2 row groups: ceil(5 / 2) = 3 files again
statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file3' (FORMAT PARQUET, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 2)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file3/*.parquet'
----
10000

query I
SELECT count(*) FROM glob('__TEST_DIR__/row_groups_per_file3/*.parquet')
----
3
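
# sanity check: with ROW_GROUPS_PER_FILE 2 no file should contain more than two row groups
query I
SELECT max(cnt) <= 2 FROM (
    SELECT file_name, count(DISTINCT row_group_id) AS cnt
    FROM parquet_metadata('__TEST_DIR__/row_groups_per_file3/*.parquet')
    GROUP BY file_name
)
----
true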

# file rotation with RETURN_STATS
query IIIIII
COPY bigdata TO '__TEST_DIR__/row_groups_per_file_stats/' (FORMAT PARQUET, WRITE_EMPTY_FILE false, FILENAME_PATTERN '{uuid}', ROW_GROUP_SIZE 3000, ROW_GROUPS_PER_FILE 2, RETURN_STATS);
----
<REGEX>:.*row_groups_per_file_stats.*[a-zA-Z0-9-]{36}.parquet <REGEX>:\d+ <REGEX>:\d+ <REGEX>:\d+ <REGEX>:{'"col_a"'={column_size_bytes=\d+, max=\d+, min=\d+, null_count=0, num_values=\d+}, '"col_b"'={column_size_bytes=\d+, max=\d+, min=\d+, null_count=0, num_values=\d+}} NULL
<REGEX>:.*row_groups_per_file_stats.*[a-zA-Z0-9-]{36}.parquet <REGEX>:\d+ <REGEX>:\d+ <REGEX>:\d+ <REGEX>:{'"col_a"'={column_size_bytes=\d+, max=\d+, min=\d+, null_count=0, num_values=\d+}, '"col_b"'={column_size_bytes=\d+, max=\d+, min=\d+, null_count=0, num_values=\d+}} NULL
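
# read the rotated files back (assuming they all land under the target directory): no rows may be lost
query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file_stats/*.parquet'
----
10000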

# now we crank up the threads
statement ok
PRAGMA verify_parallelism

statement ok
set threads=4

# with multiple threads writing to the same file the row-groups-per-file limit is best-effort,
# so we only check that no rows are lost and that the file count lands in a plausible range
# (plus a row group sum check below)
statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file4' (FORMAT PARQUET, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 1)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file4/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file4/*.parquet')
----
true
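
# consistency check (parquet_metadata() returns one row per column chunk, hence the DISTINCT):
# even with best-effort splitting, the row group sizes across all files must add up to the full table
query I
SELECT sum(row_group_num_rows) FROM (
    SELECT DISTINCT file_name, row_group_id, row_group_num_rows
    FROM parquet_metadata('__TEST_DIR__/row_groups_per_file4/*.parquet')
)
----
10000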

statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file5' (FORMAT PARQUET, ROW_GROUP_SIZE 4000, ROW_GROUPS_PER_FILE 1)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file5/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file5/*.parquet')
----
true

statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file6' (FORMAT PARQUET, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 2)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file6/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file6/*.parquet')
----
true

# should work nicely with PER_THREAD_OUTPUT (row totals re-checked via parquet_metadata below)
statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file7' (FORMAT PARQUET, PER_THREAD_OUTPUT TRUE, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 1)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file7/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file7/*.parquet')
----
true
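
# same consistency check for the PER_THREAD_OUTPUT case
query I
SELECT sum(row_group_num_rows) FROM (
    SELECT DISTINCT file_name, row_group_id, row_group_num_rows
    FROM parquet_metadata('__TEST_DIR__/row_groups_per_file7/*.parquet')
)
----
10000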

statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file8' (FORMAT PARQUET, PER_THREAD_OUTPUT TRUE, ROW_GROUP_SIZE 4000, ROW_GROUPS_PER_FILE 1)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file8/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file8/*.parquet')
----
true

statement ok
COPY bigdata TO '__TEST_DIR__/row_groups_per_file9' (FORMAT PARQUET, PER_THREAD_OUTPUT TRUE, ROW_GROUP_SIZE 2000, ROW_GROUPS_PER_FILE 2)

query I
SELECT count(*) FROM '__TEST_DIR__/row_groups_per_file9/*.parquet'
----
10000

query I
SELECT count(*) BETWEEN 1 AND 10 FROM glob('__TEST_DIR__/row_groups_per_file9/*.parquet')
----
true

# ROW_GROUPS_PER_FILE is not implemented in combination with USE_TMP_FILE or PARTITION_BY
statement error
COPY bigdata TO '__TEST_DIR__/row_groups_per_file_error' (FORMAT PARQUET, ROW_GROUPS_PER_FILE 1, USE_TMP_FILE TRUE);
----
Not implemented Error

statement error
COPY bigdata TO '__TEST_DIR__/row_groups_per_file_error' (FORMAT PARQUET, ROW_GROUPS_PER_FILE 1, PARTITION_BY col_a);
----
Not implemented Error