# name: test/sql/copy/return_stats_truncate.test
# description: test truncation of large strings/blobs in parquet stats
# group: [copy]
require parquet
# we cannot truncate blobs that are all 0xFF - so stats are omitted
query IIIIII
COPY (SELECT repeat('\xFF', 300)::BLOB as blob) TO '__TEST_DIR__/test_truncate_blob.parquet' (RETURN_STATS);
----
<REGEX>:.*test_truncate_blob.parquet 1 <REGEX>:\d+ <REGEX>:\d+ <REGEX>:{'"blob"'={column_size_bytes=\d+, null_count=0, num_values=1}} NULL
query II
SELECT min_is_exact, max_is_exact FROM parquet_metadata('__TEST_DIR__/test_truncate_blob.parquet');
----
NULL NULL
# if there is any byte that is less than 0xFF we can truncate
query IIIIII
COPY (SELECT ('\xFE' || repeat('\xFF', 300))::BLOB as blob) TO '__TEST_DIR__/test_truncate_blob.parquet' (RETURN_STATS);
----
<REGEX>:.*test_truncate_blob.parquet 1 <REGEX>:\d+ <REGEX>:\d+ <REGEX>:{'"blob"'={column_size_bytes=\d+, max=FF, min=FEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF, null_count=0, num_values=1}} NULL
query II
SELECT min_is_exact, max_is_exact FROM parquet_metadata('__TEST_DIR__/test_truncate_blob.parquet');
----
false false
# for strings we don't truncate unicode characters
query IIIIII
COPY (SELECT repeat('🦆', 300) as str) TO '__TEST_DIR__/test_truncate_string.parquet' (RETURN_STATS);
----
<REGEX>:.*test_truncate_string.parquet 1 <REGEX>:\d+ <REGEX>:\d+ <REGEX>:{'"str"'={column_size_bytes=\d+, null_count=0, num_values=1}} NULL
query II
SELECT min_is_exact, max_is_exact FROM parquet_metadata('__TEST_DIR__/test_truncate_string.parquet');
----
NULL NULL
# but we can truncate if there is a single ascii character in the string
query IIIIII
COPY (SELECT 'B' || repeat('🦆', 300) as str) TO '__TEST_DIR__/test_truncate_string.parquet' (RETURN_STATS);
----
<REGEX>:.*test_truncate_string.parquet 1 <REGEX>:\d+ <REGEX>:\d+ <REGEX>:{'"str"'={column_size_bytes=\d+, max=C, min=B🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆🦆, null_count=0, num_values=1}} NULL
query II
SELECT min_is_exact, max_is_exact FROM parquet_metadata('__TEST_DIR__/test_truncate_string.parquet');
----
false false
|