1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
|
import duckdb
# basic SQL API
# connect to an in-memory temporary database
conn = duckdb.connect()
# if you want, you can create a cursor() like described in PEP 249 but it's fully redundant
cursor = conn.cursor()
# run arbitrary SQL commands
conn.execute("CREATE TABLE test_table (i INTEGER, j STRING)")
# add some data
conn.execute("INSERT INTO test_table VALUES (1, 'one')")
# we can use placeholders for parameters
conn.execute("INSERT INTO test_table VALUES (?, ?)", [2, 'two'])
# we can provide multiple sets of parameters to executemany()
conn.executemany("INSERT INTO test_table VALUES (?, ?)", [[3, 'three'], [4, 'four']])
# fetch as pandas data frame
print(conn.execute("SELECT * FROM test_table").fetchdf())
# fetch as list of masked numpy arrays, cleaner when handling NULLs
print(conn.execute("SELECT * FROM test_table").fetchnumpy())
# we can query pandas data frames as if they were SQL views
# create a sample pandas data frame
import pandas as pd
test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4], "j": ["one", "two", "three", "four"]})
# make this data frame available as a view in duckdb
conn.register("test_df", test_df)
print(conn.execute("SELECT j FROM test_df WHERE i > 1").fetchdf())
# relation API, programmatic querying. relations are lazily evaluated chains of relational operators
# create a "relation" from a pandas data frame with an existing connection
rel = conn.from_df(test_df)
print(rel)
# alternative shorthand, use a built-in default connection to create a relation from a pandas data frame
rel = duckdb.df(test_df)
print(rel)
# create a relation from a CSV file
# first create a CSV file from our pandas example
import tempfile, os
temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))
test_df.to_csv(temp_file_name, index=False)
# now create a relation from it
rel = duckdb.from_csv_auto(temp_file_name)
print(rel)
# create a relation from an existing table
rel = conn.table("test_table")
print(rel)
# a relation has an alias (like a table name)
print(rel.alias)
# we can change the alias, useful for (self)joins for example
rel2 = rel.set_alias('new_alias')
print(rel2.alias)
# we can inspect the type of a relation
print(rel.type)
# or the column names that are in it
print(rel.columns)
# or the types of those columns
print(rel.types)
# now we can apply some operators to the relation
# filter the relation
print(rel.filter('i > 1'))
# project the relation, get some columns
print(rel.project('i, j'))
# or transform them
print(rel.project('i + 1'))
# order the relation
print(rel.order('j'))
# limit the rows returned
print(rel.limit(2))
# skip the first row and limit the number of results
print(rel.limit(2, offset=1))
# of course these things can be chained
print(rel.filter('i > 1').project('i + 1, j').order('j').limit(2))
# aggregate the relation
print(rel.aggregate("sum(i)"))
# non-aggregated columns create implicit grouping
print(rel.aggregate("j, sum(i)"))
# we can also explicit group the relation before aggregating
print(rel.aggregate("sum(i)", "j"))
# distinct values
print(rel.distinct())
# multi-relation operators are also supported, e.g union
print(rel.union(rel))
# join rel with itself on i
rel2 = conn.from_df(test_df)
print(rel.join(rel2, 'i'))
# for explicit join conditions the relations can be named using alias()
print(rel.set_alias('a').join(rel.set_alias('b'), 'a.i=b.i'))
# there are also shorthand methods to directly create a relation and apply an operator from pandas data frame objects
print(duckdb.filter(test_df, 'i > 1'))
print(duckdb.project(test_df, 'i + 1'))
print(duckdb.order(test_df, 'j'))
print(duckdb.limit(test_df, 2))
print(duckdb.aggregate(test_df, "sum(i)"))
print(duckdb.distinct(test_df))
# when chaining only the first call needs to include the data frame parameter
print(duckdb.filter(test_df, 'i > 1').project('i + 1, j').order('j').limit(2))
# turn the relation into something else again
# compute the query result from the relation
res = rel.execute()
print(res)
# res is a query result, you can call fetchdf() or fetchnumpy() or fetchone() on it
print(res.fetchone())
print(res.fetchall())
# convert a relation back to a pandas data frame
print(rel.to_df())
# df() is shorthand for to_df() on relations
print(rel.df())
# create a table in duckdb from the relation
print(rel.create("test_table2"))
# insert the relation's data into an existing table
conn.execute("CREATE TABLE test_table3 (i INTEGER, j STRING)")
print(rel.insert_into("test_table3"))
# Inserting elements into table_3
print(conn.values([5, 'five']).insert_into("test_table3"))
rel_3 = conn.table("test_table3")
rel_3.insert([6, 'six'])
# create a SQL-accessible view of the relation
print(rel.create_view('test_view'))
# we can also directly run SQL queries on relation objects without explicitly creating a view
# the first parameter gives the rel object a view name so we can refer to it in queries
res = rel.query('my_name_for_rel', 'SELECT * FROM my_name_for_rel')
print(res)
# res is a query result, we can fetch with the methods described above, e.g.
print(res.fetchone())
print(res.fetchdf())
# or just use df(), a shorthand for fetchdf() on query results
print(res.df())
# this also works directly on data frames
res = duckdb.query_df(test_df, 'my_name_for_test_df', 'SELECT * FROM my_name_for_test_df')
print(res.df())
|