File: csv_hive.test

package info (click to toggle)
duckdb 1.5.1-2
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 299,196 kB
  • sloc: cpp: 865,414; ansic: 57,292; python: 18,871; sql: 12,663; lisp: 11,751; yacc: 7,412; lex: 1,682; sh: 747; makefile: 558
file content (151 lines) | stat: -rw-r--r-- 7,561 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# name: test/sql/copy/csv/csv_hive.test
# description: Test the automatic parsing of the hive partitioning scheme
# group: [csv]

statement ok
PRAGMA enable_verification

statement ok
CREATE TABLE test AS SELECT 1 as id, 'value1' as value;
CREATE TABLE test2 AS SELECT 2 as id, 'value2' as value;

# filenames could allow you to parse hive partitions manually using SQL
query III
SELECT id, value, parse_path(filename)[-4:] from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/simple/*/*/test.csv', FILENAME=1) order by id
----
1	value1	[simple, 'part=a', 'date=2012-01-01', test.csv]
2	value2	[simple, 'part=b', 'date=2013-01-01', test.csv]

# however this is just a lot nicer
query IIII
select id, value, part, date from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/simple/*/*/test.csv', HIVE_PARTITIONING=1) order by id
----
1	value1	a	2012-01-01
2	value2	b	2013-01-01

# As long as the names match, we don't really mind since everything is a string anyway
query IIII
select id, value, part, date from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1)  order by id
----
1	value1	a	2012-01-01
2	value2	b	2013-01-01

# If the key names don't add up, we throw
statement error
select * from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/mismatching_names/*/*/test.csv', HIVE_PARTITIONING=1)
----
Hive partition mismatch

# If the key names don't add up, we throw
statement error
select * from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/mismatching_count/*/*/test.csv', HIVE_PARTITIONING=1)
----
Hive partition mismatch

# Now we do a bunch of filtering on the partitions, to test the file skipping mechanism
query IIII
select id, value, part, date from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1) where part='a'
----
1	value1	a	2012-01-01

query IIII
select id, value, part, date from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1) where part='b'
----
2	value2	b	2013-01-01

query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where part_cast > 0 and part_cast < 5000;
----
1	value1	1000	2012-01-01

query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where part_cast > 5000;
----
2	value2	9000	2013-01-01

query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where date_cast > CAST('2000-01-01' as DATE) and date_cast < CAST('2012-12-12' as DATE);
----
1	value1	1000	2012-01-01

query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where date_cast > CAST('2000-01-01' as DATE) order by date_cast;
----
1	value1	1000	2012-01-01
2	value2	9000	2013-01-01

query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where date_cast=CAST('2012-01-01' as DATE) OR part_cast=9000 ORDER BY date_cast;
----
1	value1	1000	2012-01-01
2	value2	9000	2013-01-01

## Filter expressions we can calculate during pushdown using filenames/hive partitions should be pruned

# Filtering out 0/2 files
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == '2012-01-011000') OR (part_cast=9000) ORDER BY date_cast;
----
1	value1	1000	2012-01-01
2	value2	9000	2013-01-01

# There should not be any filter operation remaining since it can be handled completely during pushdown by pruning file list
query II
EXPLAIN select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == '2012-01-011000') OR (part_cast=9000) ORDER BY date_cast;
----
physical_plan	<!REGEX>:.*FILTER.*

# Query filtering out first file
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == 'foobar') OR (part_cast=9000) ORDER BY date_cast;
----
2	value2	9000	2013-01-01

# Again, we should not have a filter operator here
query II
explain select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == 'foobar') OR (part_cast=9000) ORDER BY date_cast;
----
physical_plan	<!REGEX>:.*FILTER.*

# Query filtering out second file
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == '2012-01-011000') OR (part_cast=1337) ORDER BY date_cast;
----
1	value1	1000	2012-01-01

# Again, we should not have a filter operator here
query II
explain select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == '2012-01-011000') OR (part_cast=1337) ORDER BY date_cast;
----
physical_plan	<!REGEX>:.*FILTER.*

# Filtering out both files
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == 'foobar') OR (part_cast=1337) ORDER BY date_cast;
----

# Again, we should not have a filter operator here
query II
EXPLAIN select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == 'foobar') OR (part_cast=1337) ORDER BY date_cast;
----
physical_plan	<!REGEX>:.*FILTER.*

# projection pushdown
query I
select value from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1)  order by 1
----
value1
value2

query I
select part from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1)  order by 1
----
a
b

# project only some columns from a hive partition
query I
select date from read_csv_auto('{DATA_DIR}/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1)  order by 1
----
2012-01-01
2013-01-01