File: accuracy.sql

package info (click to toggle)
tdigest 1.4.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 588 kB
  • sloc: ansic: 2,013; sql: 1,935; makefile: 15; sh: 1
file content (156 lines) | stat: -rw-r--r-- 8,310 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
drop table if exists t;
create table t (v double precision);

drop table if exists datasets;
create table datasets (ds_name text, ds_sql text);

insert into datasets values ('uniform', 'with d as (select pow(random(), 1) as v from generate_series(1,%s))
insert into t select v from (select v, generate_series(1, %s + (%s * random())::int) from d) foo');

insert into datasets values ('pow(2)', 'with d as (select pow(random(), 2) as v from generate_series(1,%s))
insert into t select v from (select v, generate_series(1, %s + (%s * random())::int) from d) foo');

insert into datasets values ('pow(4)', 'with d as (select pow(random(), 4) as v from generate_series(1,%s))
insert into t select v from (select v, generate_series(1, %s + (%s * random())::int) from d) foo');

insert into datasets values ('pow(0.5)', 'with d as (select pow(random(), 0.5) as v from generate_series(1,%s))
insert into t select v from (select v, generate_series(1, %s + (%s * random())::int) from d) foo');

insert into datasets values ('pow(0.25)', 'with d as (select pow(random(), 0.25) as v from generate_series(1,%s))
insert into t select v from (select v, generate_series(1, %s + (%s * random())::int) from d) foo');

insert into datasets values ('1 - pow(2)', 'with d as (select 1.0 - pow(random(), 2) as v from generate_series(1,%s))
insert into t select v from (select v, generate_series(1, %s + (%s * random())::int) from d) foo');

insert into datasets values ('1 - pow(4)', 'with d as (select 1.0 - pow(random(), 4) as v from generate_series(1,%s))
insert into t select v from (select v, generate_series(1, %s + (%s * random())::int) from d) foo');

insert into datasets values ('1 - pow(0.5)', 'with d as (select 1.0 - pow(random(), 0.5) as v from generate_series(1,%s))
insert into t select v from (select v, generate_series(1, %s + (%s * random())::int) from d) foo');

insert into datasets values ('1 - pow(0.25)', 'with d as (select 1.0 - pow(random(), 0.25) as v from generate_series(1,%s))
insert into t select v from (select v, generate_series(1, %s + (%s * random())::int) from d) foo');


create or replace function test_queries(npercentiles int, p double precision, nvalues int, minvalues int, maxvalues int, out dataset text,
                                        out simple_random double precision, out simple_asc double precision, out simple_desc double precision,
                                        out preagg_random double precision, out preagg_asc double precision, out preagg_desc double precision,
                                        out simple_asc_cmp double precision, out simple_desc_cmp double precision,
                                        out preagg_random_cmp double precision, out preagg_asc_cmp double precision,
                                        out preagg_desc_cmp double precision)
returns setof record language plpgsql as $$
declare
    d record;
    perc_cont_percs double precision[];
    simple_random_percs double precision[];
    simple_asc_percs double precision[];
    simple_desc_percs double precision[];
    preagg_random_percs double precision[];
    preagg_asc_percs double precision[];
    preagg_desc_percs double precision[];
    percs double precision[];
    run int;
    tmp_simple_random double precision;
    tmp_simple_asc double precision;
    tmp_simple_desc double precision;
    tmp_preagg_random double precision;
    tmp_preagg_asc double precision;
    tmp_preagg_desc double precision;
begin

    raise notice 'percentiles % range % values % min % max %', npercentiles, p, nvalues, minvalues, maxvalues;

    -- generate percentiles
    select array_agg(x) into percs from (
      select i::double precision / npercentiles as x from generate_series(1,npercentiles) s(i)
    ) foo where x <= p or x > 1.0 - p;

    for d in (select * from datasets order by ds_name) loop

        simple_random := 0;
        simple_asc := 0;
        simple_desc := 0;
        preagg_random := 0;
        preagg_asc := 0;
        preagg_desc := 0;

        for run in 1..10 loop

            -- rebuild the table
            execute 'truncate t';
            execute format(d.ds_sql, nvalues, minvalues, (maxvalues - minvalues));
            execute 'analyze t';

            dataset := d.ds_name;

            select percentile_cont(percs) within group (order by v) into perc_cont_percs from (select * from t) d;

            select tdigest_percentile(v, 100, percs) into simple_random_percs from (select * from t order by random()) d;
            select tdigest_percentile(v, 100, percs) into simple_asc_percs from (select * from t order by v) d;
            select tdigest_percentile(v, 100, percs) into simple_desc_percs from (select * from t order by v desc) d;

            select tdigest_percentile(v, c, 100, percs) into preagg_random_percs from (select v, count(*) as c from t group by v order by random()) d;
            select tdigest_percentile(v, c, 100, percs) into preagg_asc_percs from (select v, count(*) as c from t group by v order by v) d;
            select tdigest_percentile(v, c, 100, percs) into preagg_desc_percs from (select v, count(*) as c from t group by v order by v desc) d;

            select sqrt(sum(pow(a-b,2))) into tmp_simple_random from (select unnest(perc_cont_percs) as a, unnest(simple_random_percs) as b) d;
            select sqrt(sum(pow(a-b,2))) into tmp_simple_asc from (select unnest(perc_cont_percs) as a, unnest(simple_asc_percs) as b) d;
            select sqrt(sum(pow(a-b,2))) into tmp_simple_desc from (select unnest(perc_cont_percs) as a, unnest(simple_desc_percs) as b) d;
            select sqrt(sum(pow(a-b,2))) into tmp_preagg_random from (select unnest(perc_cont_percs) as a, unnest(preagg_random_percs) as b) d;
            select sqrt(sum(pow(a-b,2))) into tmp_preagg_asc from (select unnest(perc_cont_percs) as a, unnest(preagg_asc_percs) as b) d;
            select sqrt(sum(pow(a-b,2))) into tmp_preagg_desc from (select unnest(perc_cont_percs) as a, unnest(preagg_desc_percs) as b) d;

            simple_random := simple_random + tmp_simple_random / 10;
            simple_asc := simple_asc + tmp_simple_asc / 10;
            simple_desc := simple_desc + tmp_simple_desc / 10;
            preagg_random := preagg_random + tmp_preagg_random / 10;
            preagg_asc := preagg_asc + tmp_preagg_asc / 10;
            preagg_desc := preagg_desc + tmp_preagg_desc / 10;

        end loop;

        simple_asc_cmp := round((simple_asc / simple_random)::numeric, 2);
        simple_desc_cmp := round((simple_desc / simple_random)::numeric, 2);
        preagg_random_cmp := round((preagg_random / simple_random)::numeric, 2);
        preagg_asc_cmp := round((preagg_asc / simple_random)::numeric, 2);
        preagg_desc_cmp := round((preagg_desc / simple_random)::numeric, 2);

        simple_random := round(simple_random::numeric, 6);
        simple_asc := round(simple_asc::numeric, 6);
        simple_desc := round(simple_desc::numeric, 6);
        preagg_random := round(preagg_random::numeric, 6);
        preagg_asc := round(preagg_asc::numeric, 6);
        preagg_desc := round(preagg_desc::numeric, 6);

        return next;

    end loop;

    return;

end;
$$;

select * from test_queries(1000, 0.01, 10000, 1, 1);
select * from test_queries(1000, 0.05, 10000, 1, 1);
select * from test_queries(1000, 0.1, 10000, 1, 1);
select * from test_queries(1000, 0.2, 10000, 1, 1);
select * from test_queries(1000, 0.3, 10000, 1, 1);
select * from test_queries(1000, 0.4, 10000, 1, 1);
select * from test_queries(1000, 0.5, 10000, 1, 1);

select * from test_queries(1000, 0.01, 1000, 10, 20);
select * from test_queries(1000, 0.05, 1000, 10, 20);
select * from test_queries(1000, 0.1, 1000, 10, 20);
select * from test_queries(1000, 0.2, 1000, 10, 20);
select * from test_queries(1000, 0.3, 1000, 10, 20);
select * from test_queries(1000, 0.4, 1000, 10, 20);
select * from test_queries(1000, 0.5, 1000, 10, 20);

select * from test_queries(1000, 0.01, 10000, 10, 20);
select * from test_queries(1000, 0.05, 10000, 10, 20);
select * from test_queries(1000, 0.1, 10000, 10, 20);
select * from test_queries(1000, 0.2, 10000, 10, 20);
select * from test_queries(1000, 0.3, 10000, 10, 20);
select * from test_queries(1000, 0.4, 10000, 10, 20);
select * from test_queries(1000, 0.5, 10000, 10, 20);