File: test_sniff_csv.test

package info (click to toggle)
duckdb 1.5.1-2
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 299,196 kB
  • sloc: cpp: 865,414; ansic: 57,292; python: 18,871; sql: 12,663; lisp: 11,751; yacc: 7,412; lex: 1,682; sh: 747; makefile: 558
file content (141 lines) | stat: -rw-r--r-- 10,212 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# name: test/sql/copy/csv/test_sniff_csv.test
# description: Test sniff_csv function
# group: [csv]

statement ok
PRAGMA enable_verification

# requires notwindows because tests will return \r\n to be used in the parameters
require notwindows

# This has a row with an extra column, so with ignore_errors we give preference to that
query II
SELECT quote, escape from sniff_csv('{DATA_DIR}/csv/16857.csv', ignore_errors = true);
----
"	"

query II
SELECT quote, escape from sniff_csv('{DATA_DIR}/csv/16857.csv');
----
"	"

query III
SELECT escape,quote, delimiter from sniff_csv('{DATA_DIR}/csv/later_quotes.csv');
----
"	"	,

query I
SELECT Prompt FROM sniff_csv('{DATA_DIR}/csv/real/lineitem_sample.csv');
----
FROM read_csv('{DATA_DIR}/csv/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='', escape='', new_line='\n', skip=0, comment='', header=false, columns={'column00': 'BIGINT', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d');

query IIIIIIIIIIIIIIII
FROM read_csv('{DATA_DIR}/csv/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, comment='', header=false, columns={'column00': 'BIGINT', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d')  limit 1;
----
1	15519	785	1	17	24386.67	0.04	0.02	N	O	1996-03-13	1996-02-12	1996-03-22	DELIVER IN PERSON	TRUCK	egular courts above the

query IIIIIIIIIIII
FROM sniff_csv('{DATA_DIR}/csv/real/lineitem_sample.csv');
----
|	(empty)	(empty)	\n	(empty)	0	0	[{'name': column00, 'type': BIGINT}, {'name': column01, 'type': BIGINT}, {'name': column02, 'type': BIGINT}, {'name': column03, 'type': BIGINT}, {'name': column04, 'type': BIGINT}, {'name': column05, 'type': DOUBLE}, {'name': column06, 'type': DOUBLE}, {'name': column07, 'type': DOUBLE}, {'name': column08, 'type': VARCHAR}, {'name': column09, 'type': VARCHAR}, {'name': column10, 'type': DATE}, {'name': column11, 'type': DATE}, {'name': column12, 'type': DATE}, {'name': column13, 'type': VARCHAR}, {'name': column14, 'type': VARCHAR}, {'name': column15, 'type': VARCHAR}]	%Y-%m-%d	NULL	NULL	FROM read_csv('{DATA_DIR}/csv/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='', escape='', new_line='\n', skip=0, comment='', header=false, columns={'column00': 'BIGINT', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d');

# Test Invalid Path
statement error
FROM sniff_csv('{DATA_DIR}/csv/real/non_ecziste.csv');
----
No files found that match the pattern "{DATA_DIR}/csv/real/non_ecziste.csv"

# Test different sample sizes

query IIIIIIIIIIII
FROM sniff_csv('{DATA_DIR}/csv/error/mismatch/big_bad.csv', sample_size=1);
----
,	(empty)	(empty)	\n	(empty)	0	0	[{'name': column0, 'type': BIGINT}, {'name': column1, 'type': VARCHAR}]	NULL	NULL	sample_size=1	FROM read_csv('{DATA_DIR}/csv/error/mismatch/big_bad.csv', auto_detect=false, delim=',', quote='', escape='', new_line='\n', skip=0, comment='', header=false, columns={'column0': 'BIGINT', 'column1': 'VARCHAR'}, sample_size=1);

statement error
FROM read_csv('{DATA_DIR}/csv/error/mismatch/big_bad.csv', auto_detect=false, delim=',', quote='"', escape='"', new_line='\n', skip=0, header=0, columns={'column0': 'BIGINT', 'column1': 'VARCHAR'}, sample_size=1);
----
Conversion Error: CSV Error on Line: 2176

query IIIIIIIIIIII
FROM sniff_csv('{DATA_DIR}/csv/error/mismatch/big_bad.csv', sample_size=10000);
----
,	(empty)	(empty)	\n	(empty)	0	1	[{'name': 1, 'type': VARCHAR}, {'name': A, 'type': VARCHAR}]	NULL	NULL	sample_size=10000	FROM read_csv('{DATA_DIR}/csv/error/mismatch/big_bad.csv', auto_detect=false, delim=',', quote='', escape='', new_line='\n', skip=0, comment='', header=true, columns={'1': 'VARCHAR', 'A': 'VARCHAR'}, sample_size=10000);

query IIIIIIIIIIII
FROM sniff_csv('{DATA_DIR}/csv/error/mismatch/big_bad.csv', sample_size=-1);
----
,	(empty)	(empty)	\n	(empty)	0	1	[{'name': 1, 'type': VARCHAR}, {'name': A, 'type': VARCHAR}]	NULL	NULL	sample_size=-1	FROM read_csv('{DATA_DIR}/csv/error/mismatch/big_bad.csv', auto_detect=false, delim=',', quote='', escape='', new_line='\n', skip=0, comment='', header=true, columns={'1': 'VARCHAR', 'A': 'VARCHAR'}, sample_size=-1);

# Test with defined time and timestamp
query IIIIIIIIIIII
FROM sniff_csv('{DATA_DIR}/csv/test/dateformat.csv')
----
,	(empty)	(empty)	\n	(empty)	0	0	[{'name': column0, 'type': DATE}]	%d/%m/%Y	NULL	NULL	FROM read_csv('{DATA_DIR}/csv/test/dateformat.csv', auto_detect=false, delim=',', quote='', escape='', new_line='\n', skip=0, comment='', header=false, columns={'column0': 'DATE'}, dateformat='%d/%m/%Y');

query I
FROM read_csv('{DATA_DIR}/csv/test/dateformat.csv', auto_detect=false, delim=',', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column0': 'DATE'}, dateformat='%d/%m/%Y');
----
2019-06-05


query IIIIIIIIIIII
FROM sniff_csv('{DATA_DIR}/csv/auto/time_date_timestamp_yyyy.mm.dd.csv')
----
,	(empty)	(empty)	\n	(empty)	0	1	[{'name': a, 'type': BIGINT}, {'name': b, 'type': VARCHAR}, {'name': t, 'type': TIME}, {'name': d, 'type': DATE}, {'name': ts, 'type': TIMESTAMP}]	%Y.%m.%d	%Y.%m.%d %H:%M:%S	NULL	FROM read_csv('{DATA_DIR}/csv/auto/time_date_timestamp_yyyy.mm.dd.csv', auto_detect=false, delim=',', quote='', escape='', new_line='\n', skip=0, comment='', header=true, columns={'a': 'BIGINT', 'b': 'VARCHAR', 't': 'TIME', 'd': 'DATE', 'ts': 'TIMESTAMP'}, dateformat='%Y.%m.%d', timestampformat='%Y.%m.%d %H:%M:%S');

query IIIII
FROM read_csv('{DATA_DIR}/csv/auto/time_date_timestamp_yyyy.mm.dd.csv', auto_detect=false, delim=',', quote='"', escape='"', new_line='\n', skip=0, header=true, columns={'a': 'BIGINT', 'b': 'VARCHAR', 't': 'TIME', 'd': 'DATE', 'ts': 'TIMESTAMP'}, dateformat='%Y.%m.%d', timestampformat='%Y.%m.%d %H:%M:%S');
----
123	TEST2	12:12:12	2000-01-01	2000-01-01 12:12:00
345	TEST2	14:15:30	2002-02-02	2002-02-02 14:15:00
346	TEST2	15:16:17	2004-12-13	2004-12-13 15:16:00

# Test with dirty rows
query IIIIIIIIIIII
FROM sniff_csv('{DATA_DIR}/csv/inconsistent_cells.csv')
----
,	(empty)	(empty)	\n	(empty)	5	0	[{'name': column0, 'type': BIGINT}, {'name': column1, 'type': BIGINT}, {'name': column2, 'type': BIGINT}, {'name': column3, 'type': BIGINT}, {'name': column4, 'type': BIGINT}]	NULL	NULL	NULL	FROM read_csv('{DATA_DIR}/csv/inconsistent_cells.csv', auto_detect=false, delim=',', quote='', escape='', new_line='\n', skip=5, comment='', header=false, columns={'column0': 'BIGINT', 'column1': 'BIGINT', 'column2': 'BIGINT', 'column3': 'BIGINT', 'column4': 'BIGINT'});

query IIIII
FROM read_csv('{DATA_DIR}/csv/inconsistent_cells.csv', auto_detect=false, delim=',', quote='"', escape='"', new_line='\n', skip=5, header=false, columns={'column0': 'BIGINT', 'column1': 'BIGINT', 'column2': 'BIGINT', 'column3': 'BIGINT', 'column4': 'BIGINT'});
----
1	2	3	4	5
1	2	3	4	5

# Test Header and quote '
query IIIIIIIIIIII
FROM sniff_csv('{DATA_DIR}/csv/timings.csv')
----
|	(empty)	(empty)	\n	(empty)	0	1	[{'name': tool, 'type': VARCHAR}, {'name': sf, 'type': BIGINT}, {'name': day, 'type': DATE}, {'name': batch_type, 'type': VARCHAR}, {'name': q, 'type': VARCHAR}, {'name': parameters, 'type': VARCHAR}, {'name': time, 'type': DOUBLE}]	%Y-%m-%d	NULL	NULL	FROM read_csv('{DATA_DIR}/csv/timings.csv', auto_detect=false, delim='|', quote='', escape='', new_line='\n', skip=0, comment='', header=true, columns={'tool': 'VARCHAR', 'sf': 'BIGINT', 'day': 'DATE', 'batch_type': 'VARCHAR', 'q': 'VARCHAR', 'parameters': 'VARCHAR', 'time': 'DOUBLE'}, dateformat='%Y-%m-%d');

query IIIIIII
FROM read_csv('{DATA_DIR}/csv/timings.csv', auto_detect=false, delim='|', quote='"', escape='\', new_line='\n', skip=0, header=true, columns={'tool': 'VARCHAR', 'sf': 'BIGINT', 'day': 'DATE', 'batch_type': 'VARCHAR', 'q': 'VARCHAR', 'parameters': 'VARCHAR', 'time': 'DOUBLE'}, dateformat='%Y-%m-%d') order by all limit 1;
----
Umbra	100	2012-11-29	power	1	{"datetime": "2010-02-26T03:51:21.000+00:00"}	0.05473947525024414

# Test backslash option
query IIIIIIIIIIII
FROM sniff_csv('{DATA_DIR}/csv/auto/backslash_escape.csv')
----
|	"	\	\n	(empty)	0	0	[{'name': column0, 'type': BIGINT}, {'name': column1, 'type': VARCHAR}, {'name': column2, 'type': VARCHAR}]	NULL	NULL	NULL	FROM read_csv('{DATA_DIR}/csv/auto/backslash_escape.csv', auto_detect=false, delim='|', quote='"', escape='\', new_line='\n', skip=0, comment='', header=false, columns={'column0': 'BIGINT', 'column1': 'VARCHAR', 'column2': 'VARCHAR'});

query III
FROM read_csv('{DATA_DIR}/csv/auto/backslash_escape.csv', auto_detect=false, delim='|', quote='"', escape='\', new_line='\n', skip=0, header=false, columns={'column0': 'BIGINT', 'column1': 'VARCHAR', 'column2': 'VARCHAR'});
----
123	TEST7	text1
345	TEST7	text"2"
567	TEST7	text3

# Add tests with the comment option
query IIIIIIIIIIII
FROM sniff_csv('{DATA_DIR}/csv/comments/simple.csv');
----
;	(empty)	(empty)	\n	#	0	1	[{'name': a, 'type': BIGINT}, {'name': b, 'type': BIGINT}]	NULL	NULL	NULL	FROM read_csv('{DATA_DIR}/csv/comments/simple.csv', auto_detect=false, delim=';', quote='', escape='', new_line='\n', skip=0, comment='#', header=true, columns={'a': 'BIGINT', 'b': 'BIGINT'});

# Test Prompt
query II
FROM read_csv('{DATA_DIR}/csv/comments/simple.csv', auto_detect=false, delim=';', quote='', escape='', new_line='\n', skip=0, comment='#', header=true, columns={'a': 'BIGINT', 'b': 'BIGINT'});
----
1	3
6	7