File: diff.py

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (185 lines) | stat: -rw-r--r-- 7,471 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# Copyright 2017 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Logic for diffing two SizeInfo objects. See: ./docs/diffs.md"""

import collections
import itertools
import logging
import re

import models


_STRIP_NUMBERS_PATTERN = re.compile(r'[.0-9]+$|\$\d+')
_NORMALIZE_STAR_SYMBOLS_PATTERN = re.compile(r'\s+\d+( \(.*\))?$')


# Matches symbols that are unchanged (will generally be the majority).
# Matching by size is important for string literals, which all have the same
# name, but which one addition would shift their order.
def _Key1(s):
  # Remove non-stable numbers from symbols names. E.g.:
  # * Names that are generated by macros and that use __line__ in them.
  # * Toolchain-generated names that have .# suffixes. e.g.: ".L.ref.tmp.2".
  # * Java anonymous symbols. e.g. SingleCategoryPreferences$3#this$0
  name = _STRIP_NUMBERS_PATTERN.sub('', s.full_name)
  # Prefer source_path over object_path since object_path for native files have
  # the target_name in it (which can get renamed).
  # Also because object_path of Java lambdas symbols contains a hash.
  path = s.source_path or s.object_path
  # Use section rather than section_name since clang & gcc use
  # .data.rel.ro vs. .data.rel.ro.local.
  return s.container_name, s.section, name, path, s.size_without_padding


# Same as _Key1, but size can change.
def _Key2(s):
  return _Key1(s)[:4]


# Same as _Key2, but allow signature changes (uses name rather than full_name).
def _Key3(s):
  path = s.source_path or s.object_path
  name = _STRIP_NUMBERS_PATTERN.sub('', s.name)
  clone_idx = name.find(' [clone ')
  if clone_idx != -1:
    name = name[:clone_idx]
  if name.startswith('*'):
    # "* symbol gap 3 (bar)" -> "* symbol gaps"
    name = _NORMALIZE_STAR_SYMBOLS_PATTERN.sub('s', name)
  return s.container_name, s.section, name, path


# Match on full name, but without path (to account for file moves).
def _Key4(s):
  # For string literals that contain a prefix of the string in the name, allow
  # matching up via name + size.
  if s.full_name.startswith('"'):
    return s.container_name, s.section, s.full_name, s.size_without_padding
  if not s.IsNameUnique():
    return None
  return s.container_name, s.section, s.full_name


def _MatchSymbols(before, after, key_func, padding_by_segment):
  logging.debug('%s: Building symbol index', key_func.__name__)
  before_symbols_by_key = collections.defaultdict(list)
  for s in before:
    before_symbols_by_key[key_func(s)].append(s)

  logging.debug('%s: Creating delta symbols', key_func.__name__)
  unmatched_after = []
  delta_symbols = []
  for after_sym in after:
    key = key_func(after_sym)
    before_sym = key and before_symbols_by_key.get(key)
    if before_sym:
      before_sym = before_sym.pop(0)
      # Padding tracked in aggregate, except for padding-only symbols.
      if before_sym.size_without_padding != 0:
        segment = (before_sym.container_name, before_sym.section_name)
        padding_by_segment[segment] += (after_sym.padding_pss -
                                        before_sym.padding_pss)
      delta_symbols.append(models.DeltaSymbol(before_sym, after_sym))
    else:
      unmatched_after.append(after_sym)

  logging.debug('%s: Matched %d of %d symbols', key_func.__name__,
                len(delta_symbols), len(after))

  unmatched_before = []
  for syms in before_symbols_by_key.values():
    unmatched_before.extend(syms)
  return delta_symbols, unmatched_before, unmatched_after


def _DiffSymbolGroups(containers, before, after, is_sparse):
  # For changed symbols, padding is zeroed out. In order to not lose the
  # information entirely, store it in aggregate. These aggregations are grouped
  # by "segment names", which are (container name, section name) tuples.
  padding_by_segment = collections.defaultdict(float)

  # Usually >90% of symbols are exact matches, so all of the time is spent in
  # this first pass.
  all_deltas, before, after = _MatchSymbols(before, after, _Key1,
                                            padding_by_segment)
  for key_func in (_Key2, _Key3, _Key4):
    delta_syms, before, after = _MatchSymbols(before, after, key_func,
                                              padding_by_segment)
    all_deltas.extend(delta_syms)

  logging.debug('Creating %d unmatched symbols', len(after) + len(before))
  for after_sym in after:
    all_deltas.append(models.DeltaSymbol(None, after_sym))
  for before_sym in before:
    all_deltas.append(models.DeltaSymbol(before_sym, None))

  container_from_name = {c.name: c for c in containers}

  # Create a DeltaSymbol to represent the zero'd out padding of matched symbols.
  # Skip for sparse symbols, since this would have been done while creating
  # diff symbols.
  if not is_sparse:
    for (container_name, section_name), padding in padding_by_segment.items():
      # Values need to be integer (crbug.com/1132394).
      padding = round(padding)
      if padding != 0:
        padding_sym = models.Symbol(section_name, padding)
        delta_container = container_from_name[container_name]
        padding_sym.container = delta_container.after
        # This is after _NormalizeNames() is called, so set |full_name|,
        # |template_name|, and |name|.
        padding_sym.SetName("Overhead: aggregate padding of diff'ed symbols")
        padding_sym.padding = padding
        all_deltas.append(models.DeltaSymbol(None, padding_sym))

  return models.DeltaSymbolGroup(all_deltas)


def _DiffContainerLists(before_containers, after_containers):
  """Computes diff of Containers lists, matching names."""
  # Find ordered unique names, preferring order of |container_after|.
  pairs = collections.OrderedDict()
  for c in after_containers:
    pairs[c.name] = [models.Container.Empty(), c]
  for c in before_containers:
    if c.name in pairs:
      pairs[c.name][0] = c
    else:
      pairs[c.name] = [c, models.Container.Empty()]
  ret = []
  for before, after in pairs.values():
    ret.append(models.DeltaContainer(before=before, after=after))
  # This update newly created diff Containers, not existing ones or EMPTY.
  models.BaseContainer.AssignShortNames(ret)
  return ret


def Diff(before, after, sort=False):
  """Diffs two SizeInfo objects. Returns a DeltaSizeInfo.

  See docs/diffs.md for diffing algorithm.
  """
  assert isinstance(before, models.SizeInfo)
  assert isinstance(after, models.SizeInfo)
  containers_diff = _DiffContainerLists(before.containers, after.containers)
  is_sparse = before.is_sparse and after.is_sparse
  symbol_diff = _DiffSymbolGroups(containers_diff, before.raw_symbols,
                                  after.raw_symbols, is_sparse)
  removed_sources, added_sources = symbol_diff.GetEntireAddOrRemoveSources()
  ret = models.DeltaSizeInfo(before, after, containers_diff, symbol_diff,
                             removed_sources, added_sources)

  if sort:
    syms = ret.symbols  # Triggers clustering.
    logging.debug('Grouping')
    # Group path aliases so that functions defined in headers will be sorted
    # by their actual size rather than shown as many small symbols.
    # Grouping these is nice since adding or remove just one path alias changes
    # the PSS of all symbols (a lot of noise).
    syms = syms.GroupedByAliases(same_name_only=True)
    logging.debug('Sorting')
    ret.symbols = syms.Sorted()
  logging.debug('Diff complete')
  return ret