File: rewriter.rb

package info (click to toggle)
gitlab 17.6.5-19
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 629,368 kB
  • sloc: ruby: 1,915,304; javascript: 557,307; sql: 60,639; xml: 6,509; sh: 4,567; makefile: 1,239; python: 406
file content (437 lines) | stat: -rw-r--r-- 14,319 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
# frozen_string_literal: true

# rubocop:disable CodeReuse/ActiveRecord -- This module is generating ActiveRecord relations therefore using AR methods is necessary
module UnnestedInFilters
  class Rewriter
    include Gitlab::Utils::StrongMemoize

    class ValueTable
      def initialize(model, attribute, values)
        @model = model
        @attribute = attribute.to_s
        @values = values
      end

      def to_sql
        "#{serialized_values} AS #{table_name}(#{column_name})"
      end

      def as_predicate
        "#{model.table_name}.#{column_name} = #{table_name}.#{column_name}"
      end

      private

      attr_reader :model, :attribute, :values

      delegate :connection, :columns, :attribute_types, to: :model, private: true
      delegate :quote, :quote_table_name, :quote_column_name, :visitor, to: :connection

      def table_name
        quote_table_name(attribute.pluralize)
      end

      def column_name
        quote_column_name(attribute)
      end

      def serialized_values
        if values.is_a?(Arel::Nodes::SelectStatement)
          "(#{serialized_arel_value})"
        else
          "unnest(#{serialized_array_values}::#{sql_type}[])"
        end
      end

      def serialized_arel_value
        visitor.compile(values, unprepared_statement_collector)
      end

      def serialized_array_values
        values.map(&:value)
              .then { |value| array_type.serialize(value) }
              .then { |array| quote(array) }
      end

      def array_type
        ActiveRecord::ConnectionAdapters::PostgreSQL::OID::Array.new(attribute_types[attribute])
      end

      def sql_type
        column.sql_type_metadata.sql_type
      end

      def column
        columns.find { |column| column.name == attribute }
      end

      def unprepared_statement_collector
        Arel::Collectors::SubstituteBinds.new(
          connection,
          Arel::Collectors::SQLString.new
        )
      end
    end

    # A naive query planner implementation.
    # Checks if a database-level index can be utilized by given filtering and ordering predicates.
    #
    # Supported index conditions:
    #     - All columns queried are present in the index or partial predicate
    #     - Unqueried index columns are at the end of the index
    #     - Partial indices if the partial index predicate contains only one column.
    #     - Only the `=` operator present in partial index predicate.
    #
    # Examples:
    #
    # ------------------------------------------------------------------------------
    # Queried             | Index                                      | Supported?
    # (col_1, col_2)      | (col_1, col_2)                             | Y
    # (col_1)             | (col_1, col_2)                             | Y
    # (col_2)             | (col_1, col_2)                             | N
    # (col_1, col_3)      | (col_1, col_2, col_3)                      | N
    # (col_1, col_2)      | (col_1) where col_2 = "1"                  | Y
    # (col_1, col_2)      | (col_1) where col_2 <= 1                   | N
    # (col_1, col_2)      | (col_1) where col_2 IS NULL                | N
    # (col_1, col_2)      | (col_1) where col_2 = "1" AND COL_1 = "2"  | N
    #
    class IndexCoverage
      PARTIAL_INDEX_REGEX = /(?<!\s)(?>\(*(?<column_name>\b\w+)\s*=\s*(?<column_value>\w+)\)*)(?!\s)/

      def initialize(index, where_hash, order_attributes)
        @index = index
        @where_hash = where_hash
        @order_attributes = order_attributes
      end

      def covers?
        filter_attributes_covered?            &&
          unused_columns_at_end_of_index?     &&
          can_be_used_for_sorting?
      end

      private

      attr_reader :index, :where_hash, :order_attributes

      def filter_attributes_covered?
        partial? ? partial_index_coverage? : full_index_coverage?
      end

      def can_be_used_for_sorting?
        sorts_in_same_order_as_index? &&
          no_filtering_after_sort_columns?
      end

      # All order attributes exist in the query in the same order as they are queried.
      def sorts_in_same_order_as_index?
        (index.columns & order_attributes) == order_attributes
      end

      # All order attributes exist in the query in the same order as they are queried.
      # We rely on sort order to be the same here to assume that anything following the last sort
      # should not be filtered on.
      def no_filtering_after_sort_columns?
        return true if order_attributes.empty?

        (index.columns.split(order_attributes.last).last & filter_attributes).empty?
      end

      # We assume there are <= attributes than columns because filter_attributes_covered has already passed.
      # So we check the count of unqueried columns, take that number from the end of the index,
      # and compare to ensure that any unqueried columns are only at the end of the index.
      # This also helps ensure there are no gaps in the used columns of the index.
      def unused_columns_at_end_of_index?
        remaining_columns = (index.columns - combined_attributes)

        (index.columns.last(remaining_columns.size) - remaining_columns).empty?
      end

      def partial?
        index.where.present?
      end

      def full_index_coverage?
        (filter_attributes - Array(index.columns)).empty?
      end

      def partial_index_coverage?
        return false unless partial_column

        full_index_coverage_with_partial_column? && partial_filter_matches?
      end

      def full_index_coverage_with_partial_column?
        (filter_attributes - Array(index.columns) - Array(partial_column)).empty?
      end

      def combined_attributes
        filter_attributes + order_attributes
      end

      def filter_attributes
        @filter_attributes ||= where_hash.keys
      end

      def partial_filter_matches?
        partial_filter == partial_value
      end

      def partial_filter
        where_hash[partial_column].to_s
      end

      def partial_column
        index_filter['column_name']
      end

      def partial_value
        index_filter['column_value']
      end

      def index_filter
        @index_filter ||= index.where.match(PARTIAL_INDEX_REGEX)&.named_captures.to_h
      end
    end

    def initialize(relation)
      @relation = relation
    end

    # Rewrites the given ActiveRecord::Relation object to
    # utilize the DB indices efficiently.
    #
    # Currently Postgres will produce inefficient query plans which use a `filter_predicate`
    # instead of a `access_predicate` to filter by IN clause contents. This behaviour does a table
    # read of the data for filtering, disregarding the structure of the index and losing any benefit
    # from any sorting applied to the index as it will have to resort the table read data.
    #
    # Rewriting the query using the `unnest` command induces Postgres into using the
    # appropriate index search behaviour for each column in the index by generating a
    # cartesian product between the individual items of the IN filter items and queried table.
    # This means each read column will maintain the sort order provided by the index,
    # avoiding a memory sort node in the final query plan.
    #
    # This will not work if queried columns are not all present in the index, or if unqueried
    # columns exist in the index that are not at the end, as this makes that part of the index
    # useless to Postgres and will result in a table scan anyways from that point.
    #
    #
    # Example usage;
    #
    #   relation = Vulnerabilities::Read.where(state: [1, 4])
    #   relation = relation.order(severity: :desc, vulnerability_id: :desc)
    #
    #   rewriter = UnnestedInFilters::Rewriter.new(relation)
    #   optimized_relation = rewriter.rewrite
    #
    # In the above example. the `relation` object would produce the following SQL query;
    #
    #   SELECT
    #     "vulnerability_reads".*
    #   FROM
    #     "vulnerability_reads"
    #   WHERE
    #     "vulnerability_reads"."state" IN (1, 4)
    #   ORDER BY
    #     "vulnerability_reads"."severity" DESC,
    #     "vulnerability_reads"."vulnerability_id" DESC
    #   LIMIT 20;
    #
    # And the `optimized_relation` object would would produce the following query to
    # utilize the index on (state, severity, vulnerability_id);
    #
    #   SELECT
    #     "vulnerability_reads".*
    #   FROM
    #     unnest('{1, 4}'::smallint[]) AS "states" ("state"),
    #     LATERAL (
    #       SELECT
    #         "vulnerability_reads".*
    #       FROM
    #         "vulnerability_reads"
    #       WHERE
    #         (vulnerability_reads."state" = "states"."state")
    #       ORDER BY
    #         "vulnerability_reads"."severity" DESC,
    #         "vulnerability_reads"."vulnerability_id" DESC
    #       LIMIT 20) AS vulnerability_reads
    #   ORDER BY
    #     "vulnerability_reads"."severity" DESC,
    #     "vulnerability_reads"."vulnerability_id" DESC
    #   LIMIT 20
    #
    # If one of the columns being used for filtering or ordering is the primary key,
    # then the query will be further optimized to use an index-only scan for initial filtering
    # before selecting all columns using the primary key.
    #
    # Using the prior query as an example, where `vulnerability_id` is the primary key,
    # This will be rewritten to:
    #
    #   SELECT
    #     "vulnerability_reads".*
    #   FROM
    #     "vulnerability_reads"
    #   WHERE
    #     "vulnerability_reads"."vulnerability_id"
    #   IN (
    #     SELECT
    #       "vulnerability_reads"."vulnerability_id"
    #     FROM
    #       unnest('{1, 4}'::smallint[]) AS "states" ("state"),
    #       LATERAL (
    #         SELECT
    #           "vulnerability_reads"."vulnerability_id"
    #         FROM
    #           "vulnerability_reads"
    #         WHERE
    #           (vulnerability_reads."state" = "states"."state")
    #         ORDER BY
    #           "vulnerability_reads"."severity" DESC,
    #           "vulnerability_reads"."vulnerability_id" DESC
    #           LIMIT 20
    #         ) AS vulnerability_reads
    #      )
    #   ORDER BY
    #     "vulnerability_reads"."severity" DESC,
    #     "vulnerability_reads"."vulnerability_id" DESC
    #   LIMIT 20
    def rewrite
      log_rewrite

      return filter_query unless primary_key_present?

      index_only_filter_query
    end

    def rewrite?
      strong_memoize(:rewrite) do
        in_filters.present? && has_index_coverage?
      end
    end

    private

    attr_reader :relation

    delegate :model, :order_values, :limit_value, :where_values_hash, :where_clause, to: :relation, private: true

    def log_rewrite
      ::Gitlab::AppLogger.info(message: 'Query is being rewritten by `UnnestedInFilters`', model: model.name)
    end

    def filter_query
      model.from(from).then { |relation| add_relation_defaults(relation) }
    end

    def index_only_filter_query
      model.where(model.primary_key => filter_query.select(model.primary_key))
           .then { |relation| add_relation_defaults(relation) }
    end

    def add_relation_defaults(new_relation)
      new_relation.limit(limit_value)
                  .order(order_values)
                  .includes(relation.includes_values)
                  .preload(relation.preload_values)
                  .eager_load(relation.eager_load_values)
    end

    def from
      [value_tables.map(&:to_sql) + [lateral]].join(', ')
    end

    def lateral
      "LATERAL (#{join_relation.to_sql}) AS #{model.table_name}"
    end

    def join_relation
      join_relation = value_tables.reduce(unscoped_relation) do |memo, tmp_table|
        memo.where(tmp_table.as_predicate)
      end

      join_relation = join_relation.select(combined_attributes) if primary_key_present?

      join_relation
    end

    def unscoped_relation
      relation.unscope(where: in_filters.keys)
    end

    def in_filters
      @in_filters ||= arel_in_nodes.each_with_object({}) { |node, memo| memo[node.left.name] = node.right }
    end

    def model_column_names
      @model_column_names ||= model.columns.map(&:name)
    end

    # Actively filter any nodes that don't belong to the primary queried table to prevent sql type resolution issues
    # Context: https://gitlab.com/gitlab-org/gitlab/-/issues/370271#note_1151019824
    def arel_in_nodes
      where_clause_arel_nodes
        .select { |arel_node| in_predicate?(arel_node) }
        .select { |arel_node| model_column_names.include?(arel_node.left.name) }
    end

    # `ActiveRecord::WhereClause#ast` is returning a single node when there is only one
    # predicate but returning an `Arel::Nodes::And` node if there are more than one predicates.
    # This is why we are checking the returned object responds to `children` or not.
    def where_clause_arel_nodes
      return [where_clause_ast] unless where_clause_ast.respond_to?(:children)

      where_clause_ast.children
    end

    def where_clause_ast
      @where_clause_ast ||= where_clause.ast
    end

    def in_predicate?(arel_node)
      arel_node.is_a?(Arel::Nodes::HomogeneousIn) || arel_node.is_a?(Arel::Nodes::In)
    end

    def has_index_coverage?
      indices.any?(&:covers?)
    end

    def primary_key_present?
      combined_attributes.include?(model.primary_key)
    end

    def combined_attributes
      filter_attributes + order_attributes
    end

    def filter_attributes
      @filter_attributes ||= where_clause.to_h.keys
    end

    def order_attributes
      @order_attributes ||= order_values.flat_map { |order_value| extract_column_name(order_value) }
    end

    def extract_column_name(order_value)
      case order_value
      when Arel::Nodes::Ordering
        order_value.expr.name
      when ::Gitlab::Pagination::Keyset::Order
        order_value.attribute_names
      end
    end

    def indices
      model.connection.schema_cache.indexes(model.table_name).map do |index|
        IndexCoverage.new(index, where_clause.to_h, order_attributes)
      end
    end

    def value_tables
      @value_tables ||= in_filters.map do |attribute, values|
        ValueTable.new(model, attribute, values)
      end
    end
  end
end
# rubocop:enable CodeReuse/ActiveRecord