1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
|
# Copyright 2024 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Code for detecting bad machines from Swarming task data."""
import collections
import decimal
import functools
import logging
import math
import statistics
from typing import Generator, Iterable, Optional, Set, Tuple
from bad_machine_finder import tasks
class BadMachineList:
"""Stores bad Swarming bots and the reasons for why they were deemed bad."""
def __init__(self):
self.bad_machines = collections.defaultdict(list)
def AddBadMachine(self, bot_id: str, reason: str) -> None:
"""Adds a bad machine to the list with the given reason.
Args:
bot_id: The bad machine name
reason: The reason why the machine was deemed bad
"""
self.bad_machines[bot_id].append(reason)
def Merge(self, other: 'BadMachineList') -> None:
"""Merges another BadMachineList into this one.
Args:
other: The BadMachineList to get data from.
"""
for bot_id, reasons in other.bad_machines.items():
self.bad_machines[bot_id].extend(reasons)
def RemoveLowConfidenceMachines(self, num_detections: int) -> None:
"""Removes machines that weren't flagged at least |num_detections| times.
Args:
num_detections: The minimum number times a machine needs to be flagged as
bad in order to be kept.
"""
trimmed_machines = collections.defaultdict(list)
for bot_id, reasons in self.bad_machines.items():
if len(reasons) >= num_detections:
trimmed_machines[bot_id] = reasons
else:
logging.debug(
'Bot %s removed because it was only flagged by %d detection '
'method(s)', bot_id, len(reasons))
self.bad_machines = trimmed_machines
def IterMarkdown(self) -> Generator[Tuple[str, str], None, None]:
"""Iterates over contents, producing Markdown for each machine.
Yields:
A tuple (bot_id, markdown). |bot_id| is the ID of the Swarming bot,
|markdown| is a Markdown string describing why the bot is bad.
"""
# Keep a consistent order.
bot_ids = sorted(list(self.bad_machines.keys()))
for b in bot_ids:
markdown_components = []
markdown_components.append(f' * {b}')
reasons = self.bad_machines[b]
for r in reasons:
markdown_components.append(f' * {r}')
yield b, '\n'.join(markdown_components)
class MixinGroupedBadMachines:
"""Stores BadMachineLists for multiple mixins."""
def __init__(self):
self._bad_machines_by_mixin = {}
def AddMixinData(self, mixin_name: str,
bad_machine_list: 'BadMachineList') -> None:
"""Adds a BadMachineList for |mixin_name|.
Args:
mixin_name: The name of the mixin
bad_machine_list: The BadMachineList for the mixin
"""
if mixin_name in self._bad_machines_by_mixin:
raise ValueError(
f'Bad machines for mixin {mixin_name} were already added')
self._bad_machines_by_mixin[mixin_name] = bad_machine_list
def GetAllBadMachineNames(self) -> Set[str]:
"""Gets all unique bad machine names across all stored mixins.
Returns:
A set containing all bad machine names.
"""
bad_machine_names = set()
for _, bad_machine_list in self._bad_machines_by_mixin.items():
for bot_id in bad_machine_list.bad_machines.keys():
bad_machine_names.add(bot_id)
return bad_machine_names
def GenerateMarkdown(self,
bots_to_skip: Optional[Iterable[str]] = None) -> str:
"""Generates a Markdown string describing the object's contents.
Args:
bots_to_skip: An optional iterable of bot names to not include in the
Markdown content. If not provided, all bots will be included.
Returns:
A Markdown string describing each bad machine for each mixin.
"""
bots_to_skip = bots_to_skip or []
markdown_components = []
# Guarantee a consistent ordering.
for mixin_name in sorted(self._bad_machines_by_mixin.keys()):
bad_machine_list = self._bad_machines_by_mixin[mixin_name]
mixin_report_components = [
f'Bad machines for {mixin_name}',
]
for bot_id, markdown in bad_machine_list.IterMarkdown():
if bot_id in bots_to_skip:
continue
mixin_report_components.append(markdown)
if len(mixin_report_components) > 1:
markdown_components.append('\n'.join(mixin_report_components))
return '\n\n'.join(markdown_components)
def DetectViaStdDevOutlier(mixin_stats: tasks.MixinStats,
stddev_multiplier: float,
min_failures: int) -> 'BadMachineList':
"""Detects bad machines by looking for machines whose task failure rate is
more than the given number of standard deviations from the fleet-wide mean.
Args:
mixin_stats: A tasks.MixinStats containing the task stats for the hardware
fleet being checked.
stddev_multiplier: A multiplier to apply to the standard deviation. If a
machine's failure rate is greater than (fleet-wide mean) + (stddev *
stddev_multiplier), it is considered bad.
min_failures: The minimum number of failures a bot must have in order to be
reported. This helps avoid cases where healthy bots are erroneously
reported due to getting a small number of random failures in a small
number of total tasks.
Returns:
A BadMachineList containing any bad machines found.
"""
if mixin_stats.total_tasks <= 0:
raise ValueError('mixin_stats needs to contain data')
if stddev_multiplier < 0:
raise ValueError('stddev_multiplier must be non-negative')
bad_machines = BadMachineList()
failure_rates = mixin_stats.GetOverallFailureRates()
mean = statistics.mean(failure_rates)
stddev = statistics.pstdev(failure_rates)
threshold = mean + stddev_multiplier * stddev
for bot_id, bot_stats in mixin_stats.IterBots():
bot_failure_rate = bot_stats.overall_failure_rate
if bot_failure_rate <= threshold:
continue
if bot_stats.failed_tasks < min_failures:
logging.debug(
'Bot %s skipped in DetectViaStdDevOutlier due to only having %d '
'failed tasks', bot_id, bot_stats.failed_tasks)
continue
reason = (f'Had a failure rate of {bot_failure_rate} despite a fleet-wide '
f'average of {mean} and a standard deviation of {stddev}.')
bad_machines.AddBadMachine(bot_id, reason)
return bad_machines
def DetectViaRandomChance(mixin_stats: tasks.MixinStats,
probability_threshold: float) -> 'BadMachineList':
"""Detects bad machines by looking for cases where it is very unlikely that
a machine got as many failed tasks as it did through random chance. If it is
unlikely that the failures happened due to random chance, then that means that
the machine is bad and contributing to failures.
Args:
mixin_stats: A tasks.MixinStats containing the task stats for the hardware
fleet being checked.
probability_threshold: How unlikely it can be that a machine got its
failures randomly and still be considered good.
Returns:
A BadMachineList containing any bad machines found.
"""
if mixin_stats.total_tasks <= 0:
raise ValueError('mixin_stats needs to contain data')
if probability_threshold <= 0:
raise ValueError('probability_threshold must be positive')
if probability_threshold > 1:
raise ValueError('probability_threshold must be <= 1')
bad_machines = BadMachineList()
average_failure_rate = (decimal.Decimal(mixin_stats.failed_tasks) /
decimal.Decimal(mixin_stats.total_tasks))
for bot_id, bot_stats in mixin_stats.IterBots():
p = _ChanceOfNOrMoreIndependentEvents(average_failure_rate,
bot_stats.total_tasks,
bot_stats.failed_tasks)
if p >= probability_threshold:
continue
reason = (f'{bot_stats.failed_tasks} of {bot_stats.total_tasks} tasks '
f'failed despite a fleet-wide average failed task rate of '
f'{average_failure_rate}. The probability of this happening '
f'randomly is {p}.')
bad_machines.AddBadMachine(bot_id, reason)
return bad_machines
def DetectViaInterquartileRange(mixin_stats: tasks.MixinStats, mixin_name: str,
iqr_multiplier: float,
min_failures: int) -> 'BadMachineList':
"""Detects bad machines by looking for for bots whose failure rate is above
Q3 + |iqr_multiplier| * IQR, which is a standard way of looking for outliers
in data.
See https://en.wikipedia.org/wiki/Interquartile_range#Outliers.
Args:
mixin_stats: A tasks.MixinStats containing the task stats for the hardware
fleet being checked.
mixin_name: The name of the mixin being checked. For debugging purposes.
iqr_multiplier: How many multiples of the IQR above the third quartile a
failure rate has to be to be considered an outlier.
min_failures: The minimum number of failures a bot must have in order to be
reported. This helps avoid cases where healthy bots are erroneously
reported due to getting a small number of random failures in a small
number of total tasks.
Returns:
A BadMachineList containing any bad machines found.
"""
if mixin_stats.total_tasks <= 0:
raise ValueError('mixin_stats needs to contain data')
if iqr_multiplier <= 0:
raise ValueError('iqr_multiplier must be positive')
bad_machines = BadMachineList()
failure_rates = mixin_stats.GetOverallFailureRates()
if len(failure_rates) <= 4:
logging.info(
'Quartiles require at least 5 samples to be meaningful. Mixin %s only '
'provided %d samples.', mixin_name, len(failure_rates))
return bad_machines
quartiles = statistics.quantiles(failure_rates, n=4, method='inclusive')
iqr = quartiles[2] - quartiles[0]
if iqr == 0:
logging.info(
'Mixin %s resulted in an IQR of 0, which is not useful for detecting '
'outliers.', mixin_name)
return bad_machines
upper_bound = quartiles[2] + iqr_multiplier * iqr
for bot_id, bot_stats in mixin_stats.IterBots():
bot_failure_rate = bot_stats.overall_failure_rate
if bot_failure_rate <= upper_bound:
continue
if bot_stats.failed_tasks < min_failures:
logging.debug(
'Bot %s skipped in DetectViaInterquartileRange due to only having %d '
'failed tasks', bot_id, bot_stats.failed_tasks)
continue
reason = (f'Failure rate of {bot_failure_rate} is above the IQR-based '
f'upper bound of {upper_bound}.')
bad_machines.AddBadMachine(bot_id, reason)
return bad_machines
@functools.lru_cache(maxsize=None)
def _ChanceOfNOrMoreIndependentEvents(event_probability: decimal.Decimal,
total_events: int, n: int) -> float:
"""Calculates the probability of getting |n| or more cases of independent
events.
Args:
event_probability: The probability of the event happening.
total_events: The number of times we check whether the event happened or
not.
n: The minimum number of times the event happened.
"""
cumulative_probability = decimal.Decimal(0)
for current_n in range(n, total_events + 1):
cumulative_probability += _ChanceOfExactlyNIndependentEvents(
event_probability, total_events, current_n)
return float(cumulative_probability)
@functools.lru_cache(maxsize=None)
def _ChanceOfExactlyNIndependentEvents(event_probability: decimal.Decimal,
total_events: int,
n: int) -> decimal.Decimal:
"""Calculates the probability of getting exactly |n| cases of independent
events.
Args:
event_probability: The probability of the event happening.
total_events: The number of times we check whether the event happened or
not.
n: The number of times the event happened.
Returns:
A decimal.Decimal object containing the chance that the event happened
exactly |n| times.
"""
# Special case these to avoid 0**0, which results in an error when using
# decimal.Decimal.
# Probability of 0 occurs in practice when a configuration was 100% stable
if event_probability == 0:
if n == 0:
return decimal.Decimal(1)
return decimal.Decimal(0)
# Event probability of 1 occurs in practice when a configuration was 0%
# stable.
if event_probability == 1:
if n == total_events:
return decimal.Decimal(1)
return decimal.Decimal(0)
# We use decimal.Decimal instead of float since math.comb() can produce
# numbers that are too large to store in a float.
combinations = decimal.Decimal(math.comb(total_events, n))
chance_of_one_permutation = (event_probability**n *
(1 - event_probability)**(total_events - n))
return combinations * chance_of_one_permutation
|