File: detection.py

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (344 lines) | stat: -rw-r--r-- 13,118 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
# Copyright 2024 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Code for detecting bad machines from Swarming task data."""

import collections
import decimal
import functools
import logging
import math
import statistics
from typing import Generator, Iterable, Optional, Set, Tuple

from bad_machine_finder import tasks


class BadMachineList:
  """Stores bad Swarming bots and the reasons for why they were deemed bad."""

  def __init__(self):
    self.bad_machines = collections.defaultdict(list)

  def AddBadMachine(self, bot_id: str, reason: str) -> None:
    """Adds a bad machine to the list with the given reason.

    Args:
      bot_id: The bad machine name
      reason: The reason why the machine was deemed bad
    """
    self.bad_machines[bot_id].append(reason)

  def Merge(self, other: 'BadMachineList') -> None:
    """Merges another BadMachineList into this one.

    Args:
      other: The BadMachineList to get data from.
    """
    for bot_id, reasons in other.bad_machines.items():
      self.bad_machines[bot_id].extend(reasons)

  def RemoveLowConfidenceMachines(self, num_detections: int) -> None:
    """Removes machines that weren't flagged at least |num_detections| times.

    Args:
      num_detections: The minimum number times a machine needs to be flagged as
          bad in order to be kept.
    """
    trimmed_machines = collections.defaultdict(list)
    for bot_id, reasons in self.bad_machines.items():
      if len(reasons) >= num_detections:
        trimmed_machines[bot_id] = reasons
      else:
        logging.debug(
            'Bot %s removed because it was only flagged by %d detection '
            'method(s)', bot_id, len(reasons))
    self.bad_machines = trimmed_machines

  def IterMarkdown(self) -> Generator[Tuple[str, str], None, None]:
    """Iterates over contents, producing Markdown for each machine.

    Yields:
      A tuple (bot_id, markdown). |bot_id| is the ID of the Swarming bot,
      |markdown| is a Markdown string describing why the bot is bad.
    """
    # Keep a consistent order.
    bot_ids = sorted(list(self.bad_machines.keys()))
    for b in bot_ids:
      markdown_components = []
      markdown_components.append(f'  * {b}')
      reasons = self.bad_machines[b]
      for r in reasons:
        markdown_components.append(f'    * {r}')
      yield b, '\n'.join(markdown_components)


class MixinGroupedBadMachines:
  """Stores BadMachineLists for multiple mixins."""

  def __init__(self):
    self._bad_machines_by_mixin = {}

  def AddMixinData(self, mixin_name: str,
                   bad_machine_list: 'BadMachineList') -> None:
    """Adds a BadMachineList for |mixin_name|.

    Args:
      mixin_name: The name of the mixin
      bad_machine_list: The BadMachineList for the mixin
    """
    if mixin_name in self._bad_machines_by_mixin:
      raise ValueError(
          f'Bad machines for mixin {mixin_name} were already added')
    self._bad_machines_by_mixin[mixin_name] = bad_machine_list

  def GetAllBadMachineNames(self) -> Set[str]:
    """Gets all unique bad machine names across all stored mixins.

    Returns:
      A set containing all bad machine names.
    """
    bad_machine_names = set()
    for _, bad_machine_list in self._bad_machines_by_mixin.items():
      for bot_id in bad_machine_list.bad_machines.keys():
        bad_machine_names.add(bot_id)
    return bad_machine_names

  def GenerateMarkdown(self,
                       bots_to_skip: Optional[Iterable[str]] = None) -> str:
    """Generates a Markdown string describing the object's contents.

    Args:
      bots_to_skip: An optional iterable of bot names to not include in the
          Markdown content. If not provided, all bots will be included.

    Returns:
      A Markdown string describing each bad machine for each mixin.
    """
    bots_to_skip = bots_to_skip or []
    markdown_components = []
    # Guarantee a consistent ordering.
    for mixin_name in sorted(self._bad_machines_by_mixin.keys()):
      bad_machine_list = self._bad_machines_by_mixin[mixin_name]
      mixin_report_components = [
          f'Bad machines for {mixin_name}',
      ]
      for bot_id, markdown in bad_machine_list.IterMarkdown():
        if bot_id in bots_to_skip:
          continue
        mixin_report_components.append(markdown)
      if len(mixin_report_components) > 1:
        markdown_components.append('\n'.join(mixin_report_components))

    return '\n\n'.join(markdown_components)


def DetectViaStdDevOutlier(mixin_stats: tasks.MixinStats,
                           stddev_multiplier: float,
                           min_failures: int) -> 'BadMachineList':
  """Detects bad machines by looking for machines whose task failure rate is
  more than the given number of standard deviations from the fleet-wide mean.

  Args:
    mixin_stats: A tasks.MixinStats containing the task stats for the hardware
        fleet being checked.
    stddev_multiplier: A multiplier to apply to the standard deviation. If a
        machine's failure rate is greater than (fleet-wide mean) + (stddev *
        stddev_multiplier), it is considered bad.
    min_failures: The minimum number of failures a bot must have in order to be
        reported. This helps avoid cases where healthy bots are erroneously
        reported due to getting a small number of random failures in a small
        number of total tasks.

  Returns:
    A BadMachineList containing any bad machines found.
  """
  if mixin_stats.total_tasks <= 0:
    raise ValueError('mixin_stats needs to contain data')
  if stddev_multiplier < 0:
    raise ValueError('stddev_multiplier must be non-negative')

  bad_machines = BadMachineList()

  failure_rates = mixin_stats.GetOverallFailureRates()
  mean = statistics.mean(failure_rates)
  stddev = statistics.pstdev(failure_rates)
  threshold = mean + stddev_multiplier * stddev

  for bot_id, bot_stats in mixin_stats.IterBots():
    bot_failure_rate = bot_stats.overall_failure_rate
    if bot_failure_rate <= threshold:
      continue
    if bot_stats.failed_tasks < min_failures:
      logging.debug(
          'Bot %s skipped in DetectViaStdDevOutlier due to only having %d '
          'failed tasks', bot_id, bot_stats.failed_tasks)
      continue
    reason = (f'Had a failure rate of {bot_failure_rate} despite a fleet-wide '
              f'average of {mean} and a standard deviation of {stddev}.')
    bad_machines.AddBadMachine(bot_id, reason)

  return bad_machines


def DetectViaRandomChance(mixin_stats: tasks.MixinStats,
                          probability_threshold: float) -> 'BadMachineList':
  """Detects bad machines by looking for cases where it is very unlikely that
  a machine got as many failed tasks as it did through random chance. If it is
  unlikely that the failures happened due to random chance, then that means that
  the machine is bad and contributing to failures.

  Args:
    mixin_stats: A tasks.MixinStats containing the task stats for the hardware
        fleet being checked.
    probability_threshold: How unlikely it can be that a machine got its
        failures randomly and still be considered good.

  Returns:
    A BadMachineList containing any bad machines found.
  """
  if mixin_stats.total_tasks <= 0:
    raise ValueError('mixin_stats needs to contain data')
  if probability_threshold <= 0:
    raise ValueError('probability_threshold must be positive')
  if probability_threshold > 1:
    raise ValueError('probability_threshold must be <= 1')

  bad_machines = BadMachineList()

  average_failure_rate = (decimal.Decimal(mixin_stats.failed_tasks) /
                          decimal.Decimal(mixin_stats.total_tasks))
  for bot_id, bot_stats in mixin_stats.IterBots():
    p = _ChanceOfNOrMoreIndependentEvents(average_failure_rate,
                                          bot_stats.total_tasks,
                                          bot_stats.failed_tasks)
    if p >= probability_threshold:
      continue
    reason = (f'{bot_stats.failed_tasks} of {bot_stats.total_tasks} tasks '
              f'failed despite a fleet-wide average failed task rate of '
              f'{average_failure_rate}. The probability of this happening '
              f'randomly is {p}.')
    bad_machines.AddBadMachine(bot_id, reason)

  return bad_machines


def DetectViaInterquartileRange(mixin_stats: tasks.MixinStats, mixin_name: str,
                                iqr_multiplier: float,
                                min_failures: int) -> 'BadMachineList':
  """Detects bad machines by looking for for bots whose failure rate is above
  Q3 + |iqr_multiplier| * IQR, which is a standard way of looking for outliers
  in data.

  See https://en.wikipedia.org/wiki/Interquartile_range#Outliers.

  Args:
    mixin_stats: A tasks.MixinStats containing the task stats for the hardware
        fleet being checked.
    mixin_name: The name of the mixin being checked. For debugging purposes.
    iqr_multiplier: How many multiples of the IQR above the third quartile a
        failure rate has to be to be considered an outlier.
    min_failures: The minimum number of failures a bot must have in order to be
        reported. This helps avoid cases where healthy bots are erroneously
        reported due to getting a small number of random failures in a small
        number of total tasks.

  Returns:
    A BadMachineList containing any bad machines found.
  """
  if mixin_stats.total_tasks <= 0:
    raise ValueError('mixin_stats needs to contain data')
  if iqr_multiplier <= 0:
    raise ValueError('iqr_multiplier must be positive')

  bad_machines = BadMachineList()
  failure_rates = mixin_stats.GetOverallFailureRates()
  if len(failure_rates) <= 4:
    logging.info(
        'Quartiles require at least 5 samples to be meaningful. Mixin %s only '
        'provided %d samples.', mixin_name, len(failure_rates))
    return bad_machines

  quartiles = statistics.quantiles(failure_rates, n=4, method='inclusive')
  iqr = quartiles[2] - quartiles[0]

  if iqr == 0:
    logging.info(
        'Mixin %s resulted in an IQR of 0, which is not useful for detecting '
        'outliers.', mixin_name)
    return bad_machines

  upper_bound = quartiles[2] + iqr_multiplier * iqr

  for bot_id, bot_stats in mixin_stats.IterBots():
    bot_failure_rate = bot_stats.overall_failure_rate
    if bot_failure_rate <= upper_bound:
      continue
    if bot_stats.failed_tasks < min_failures:
      logging.debug(
          'Bot %s skipped in DetectViaInterquartileRange due to only having %d '
          'failed tasks', bot_id, bot_stats.failed_tasks)
      continue
    reason = (f'Failure rate of {bot_failure_rate} is above the IQR-based '
              f'upper bound of {upper_bound}.')
    bad_machines.AddBadMachine(bot_id, reason)

  return bad_machines


@functools.lru_cache(maxsize=None)
def _ChanceOfNOrMoreIndependentEvents(event_probability: decimal.Decimal,
                                      total_events: int, n: int) -> float:
  """Calculates the probability of getting |n| or more cases of independent
  events.

  Args:
    event_probability: The probability of the event happening.
    total_events: The number of times we check whether the event happened or
        not.
    n: The minimum number of times the event happened.
  """
  cumulative_probability = decimal.Decimal(0)
  for current_n in range(n, total_events + 1):
    cumulative_probability += _ChanceOfExactlyNIndependentEvents(
        event_probability, total_events, current_n)
  return float(cumulative_probability)


@functools.lru_cache(maxsize=None)
def _ChanceOfExactlyNIndependentEvents(event_probability: decimal.Decimal,
                                       total_events: int,
                                       n: int) -> decimal.Decimal:
  """Calculates the probability of getting exactly |n| cases of independent
  events.

  Args:
    event_probability: The probability of the event happening.
    total_events: The number of times we check whether the event happened or
        not.
    n: The number of times the event happened.

  Returns:
    A decimal.Decimal object containing the chance that the event happened
    exactly |n| times.
  """
  # Special case these to avoid 0**0, which results in an error when using
  # decimal.Decimal.
  # Probability of 0 occurs in practice when a configuration was 100% stable
  if event_probability == 0:
    if n == 0:
      return decimal.Decimal(1)
    return decimal.Decimal(0)
  # Event probability of 1 occurs in practice when a configuration was 0%
  # stable.
  if event_probability == 1:
    if n == total_events:
      return decimal.Decimal(1)
    return decimal.Decimal(0)

  # We use decimal.Decimal instead of float since math.comb() can produce
  # numbers that are too large to store in a float.
  combinations = decimal.Decimal(math.comb(total_events, n))
  chance_of_one_permutation = (event_probability**n *
                               (1 - event_probability)**(total_events - n))
  return combinations * chance_of_one_permutation