File: arrow_filter_pushdown.cpp

package info (click to toggle)
duckdb 1.5.1-2
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 299,196 kB
  • sloc: cpp: 865,414; ansic: 57,292; python: 18,871; sql: 12,663; lisp: 11,751; yacc: 7,412; lex: 1,682; sh: 747; makefile: 558
file content (122 lines) | stat: -rw-r--r-- 5,047 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#include <regex>

#include "catch.hpp"

#include "arrow/arrow_test_helper.hpp"

using namespace duckdb;

// Helper: run `query` on `con` and wrap its result in an ArrowTestFactory.
// Before the query runs, the connection's Arrow string settings are chosen by `use_string_view`:
//   true  -> produce_arrow_string_view = true,  arrow_output_version = '1.5'
//   false -> produce_arrow_string_view = false, arrow_output_version = '1.0'
static unique_ptr<ArrowTestFactory> MakeArrowFactory(Connection &con, const string &query, bool use_string_view) {
	const string string_view_sql =
	    use_string_view ? "SET produce_arrow_string_view = true" : "SET produce_arrow_string_view = false";
	const string output_version_sql =
	    use_string_view ? "SET arrow_output_version = '1.5'" : "SET arrow_output_version = '1.0'";
	REQUIRE(!con.Query(string_view_sql)->HasError());
	REQUIRE(!con.Query(output_version_sql)->HasError());

	// The client properties are read only after both settings have been applied
	auto props = con.context->GetClientProperties();
	auto query_result = con.context->Query(query, false);
	REQUIRE(!query_result->HasError());

	// Copy the schema first: the result itself is moved into the factory below
	auto column_types = query_result->types;
	auto column_names = query_result->names;
	return make_uniq<ArrowTestFactory>(std::move(column_types), std::move(column_names), std::move(query_result),
	                                   false, props, *con.context);
}

// Helper: build an arrow_scan over `factory`, apply `filter_expr` on top of it, and return
// the EXPLAIN text found in column 1, row 0 of the materialized explain result
static string GetExplainForFilter(Connection &con, ArrowTestFactory &factory, const string &filter_expr) {
	const auto filtered_scan =
	    con.TableFunction("arrow_scan", ArrowTestHelper::ConstructArrowScan(factory))->Filter(filter_expr);
	const auto plan_result = filtered_scan->Explain();
	REQUIRE(!plan_result->HasError());
	const auto plan_cell = plan_result->Cast<MaterializedQueryResult>().GetValue(1, 0);
	return plan_cell.ToString();
}

// Helper: tells whether the EXPLAIN text contains the title row of a FILTER operator
static bool StandaloneFilter(const std::string &explain_str) {
	// The title row is the word FILTER between two vertical box borders, padded with at
	// least one space on each side. A node that matches looks like:
	//
	// ┌─────────────┴─────────────┐
	// │           FILTER          │
	// │    ────────────────────   │
	// │  ((a > 25) AND (b > 25))  │
	// │                           │
	// │          ~20 rows         │
	// └─────────────┬─────────────┘
	//
	const std::regex filter_title(R"(│ +FILTER +│)");
	return std::regex_search(explain_str.begin(), explain_str.end(), filter_title);
}

// Helper: tells whether the EXPLAIN text shows an ARROW_SCAN node that lists pushed-down filters
static bool FilterInScan(const std::string &explain_str) {
	// The pattern needs a "Function: ARROW_SCAN" row and, somewhere after it (any characters,
	// newlines included, matched lazily), a "Filters:" row closed by a box border.
	// A node that matches looks like:
	//
	// ┌─────────────┴─────────────┐
	// │         ARROW_SCAN        │
	// │    ────────────────────   │
	// │    Function: ARROW_SCAN   │
	// │                           │
	// │          Filters:         │
	// │            a>25           │
	// │            b>25           │
	// │                           │
	// │           ~1 row          │
	// └───────────────────────────┘
	//
	const std::regex scan_with_filters(
	    R"(│[ \t]*Function:[ \t]*ARROW_SCAN[ \t]*│[\s\S]*?│[ \t]*Filters:[ \t]*([^│]*?)[ \t]*│)");
	return std::regex_search(explain_str.begin(), explain_str.end(), scan_with_filters);
}

// Checks where the `id > 5` filter ends up in the plan of an arrow_scan, depending on
// whether the Arrow data was produced with string views enabled
TEST_CASE("Arrow filter pushdown - view types disable pushdown", "[arrow]") {
	DuckDB db;
	Connection con(db);

	// Source table: integer column `id` plus its VARCHAR rendering `name`, 10 rows
	REQUIRE(!con.Query("CREATE TABLE src AS SELECT i AS id, i::VARCHAR AS name FROM range(10) tbl(i)")->HasError());

	SECTION("String view column: filter above scan (not pushed)") {
		// string views on: expect a separate FILTER node and no filters listed in the scan
		const auto arrow_factory = MakeArrowFactory(con, "SELECT * FROM src", true);
		const auto plan = GetExplainForFilter(con, *arrow_factory, "id > 5");
		REQUIRE(StandaloneFilter(plan));
		REQUIRE(!FilterInScan(plan));
	}

	SECTION("Regular string column: filter pushed into scan") {
		// string views off: expect the filter inside the scan and no FILTER node
		const auto arrow_factory = MakeArrowFactory(con, "SELECT * FROM src", false);
		const auto plan = GetExplainForFilter(con, *arrow_factory, "id > 5");
		REQUIRE(!StandaloneFilter(plan));
		REQUIRE(FilterInScan(plan));
	}

	SECTION("Integer-only table: filter pushed into scan") {
		// only the integer column is projected: expect the filter inside the scan
		const auto arrow_factory = MakeArrowFactory(con, "SELECT id FROM src", false);
		const auto plan = GetExplainForFilter(con, *arrow_factory, "id > 5");
		REQUIRE(!StandaloneFilter(plan));
		REQUIRE(FilterInScan(plan));
	}
}

// Same check as for plain string columns, but with the VARCHAR column wrapped in a
// struct or a list while string views are enabled
TEST_CASE("Arrow filter pushdown - nested view types disable pushdown", "[arrow]") {
	DuckDB db;
	Connection con(db);

	// Source table: integer column `id` plus a 'val_'-prefixed VARCHAR column `name`, 10 rows
	REQUIRE(!con.Query("CREATE TABLE src AS SELECT i AS id, 'val_' || i::VARCHAR AS name FROM range(10) tbl(i)")
	             ->HasError());

	SECTION("Struct containing string_view") {
		// expect a separate FILTER node and no filters listed in the scan
		const auto arrow_factory = MakeArrowFactory(con, "SELECT id, {'s': name} AS nested FROM src", true);
		const auto plan = GetExplainForFilter(con, *arrow_factory, "id > 5");
		REQUIRE(StandaloneFilter(plan));
		REQUIRE(!FilterInScan(plan));
	}

	SECTION("List containing string_view") {
		// expect a separate FILTER node and no filters listed in the scan
		const auto arrow_factory = MakeArrowFactory(con, "SELECT id, [name] AS names FROM src", true);
		const auto plan = GetExplainForFilter(con, *arrow_factory, "id > 5");
		REQUIRE(StandaloneFilter(plan));
		REQUIRE(!FilterInScan(plan));
	}
}