# frozen_string_literal: true
require 'sidekiq/scheduled'
require 'sidekiq/api'
module Sidekiq
  ##
  # Automatically retry jobs that fail in Sidekiq.
  # Sidekiq's retry support assumes a typical development lifecycle:
  #
  # 0. Push some code changes with a bug in them.
  # 1. The bug causes job processing to fail; Sidekiq's middleware captures
  #    the job and pushes it onto a retry queue.
  # 2. Sidekiq retries jobs in the retry queue multiple times with
  #    an exponential delay; the job continues to fail.
  # 3. After a few days, a developer deploys a fix and the job is
  #    reprocessed successfully.
  # 4. If no fix arrives and retries are exhausted, Sidekiq gives up and
  #    moves the job to the Dead Job Queue (aka the morgue), where it must
  #    be dealt with manually in the Web UI.
  # 5. After 6 months on the DJQ, Sidekiq will discard the job.
  #
  # A job looks like:
  #
  #   { 'class' => 'HardWorker', 'args' => [1, 2, 'foo'], 'retry' => true }
  #
  # The 'retry' option also accepts a number (in place of 'true'):
  #
  #   { 'class' => 'HardWorker', 'args' => [1, 2, 'foo'], 'retry' => 5 }
  #
  # The job will be retried this number of times before Sidekiq gives up.
  # (If simply 'true', Sidekiq retries 25 times.)
  #
  # We'll add a bit more data to the job to support retries:
  #
  # * 'queue' - the queue to use
  # * 'retry_count' - number of times we've retried so far
  # * 'error_message' - the message from the exception
  # * 'error_class' - the exception class
  # * 'failed_at' - the first time it failed
  # * 'retried_at' - the last time it was retried
  # * 'backtrace' - the number of lines of error backtrace to store
  #
  # We don't store the backtrace by default as that can add a lot of overhead
  # to the job and everyone is using an error service, right? See the
  # illustrative payload below.
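  #
  # For illustration only (an assumed, not exact, payload), a job that has
  # failed once and is waiting in the retry set might carry data like:
  #
  #   { 'class' => 'HardWorker', 'args' => [1, 2, 'foo'], 'retry' => true,
  #     'queue' => 'default', 'jid' => 'b4a577edbccf1d805744efa9',
  #     'retry_count' => 0, 'failed_at' => 1561815000.123,
  #     'error_class' => 'RuntimeError', 'error_message' => 'boom' }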
  #
  # The default number of retries is 25, which works out to about 3 weeks
  # of retries. You can change the default maximum number of retries in
  # your initializer:
  #
  #   Sidekiq.options[:max_retries] = 7
  #
  # or limit the number of retries for a particular worker with:
  #
  #   class MyWorker
  #     include Sidekiq::Worker
  #     sidekiq_options :retry => 10
  #   end
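  #
  # Other per-worker options read by this class can be set the same way.
  # The values below are illustrative, not defaults (a sketch):
  #
  #   class MyWorker
  #     include Sidekiq::Worker
  #     # keep 20 lines of backtrace, retry on a dedicated queue and
  #     # skip the morgue entirely once retries are exhausted
  #     sidekiq_options :retry => 5, :backtrace => 20,
  #                     :retry_queue => 'retries_low', :dead => false
  #   end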
  #
  class JobRetry
    class Skip < ::RuntimeError; end

    include Sidekiq::Util

    DEFAULT_MAX_RETRY_ATTEMPTS = 25

    def initialize(options = {})
      @max_retries = Sidekiq.options.merge(options).fetch(:max_retries, DEFAULT_MAX_RETRY_ATTEMPTS)
    end

    # The global retry handler requires only the barest of data.
    # We want to be able to retry as much as possible so we don't
    # require the worker to be instantiated.
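    #
    # Roughly how the processor wraps job execution with these handlers
    # (an illustrative sketch, not the exact call site):
    #
    #   retrier = Sidekiq::JobRetry.new
    #   retrier.global(msg, queue) do
    #     retrier.local(worker, msg, queue) do
    #       worker.perform(*msg['args'])
    #     end
    #   end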
    def global(msg, queue)
      yield
    rescue Skip => ex
      raise ex
    rescue Sidekiq::Shutdown => ey
      # ignore, will be pushed back onto queue during hard_shutdown
      raise ey
    rescue Exception => e
      # if the failure was ultimately caused by shutdown, re-raise as a
      # Shutdown so the job is pushed back onto the queue during hard_shutdown
      raise Sidekiq::Shutdown if exception_caused_by_shutdown?(e)

      if msg['retry']
        attempt_retry(nil, msg, queue, e)
      else
        Sidekiq.death_handlers.each do |handler|
          begin
            handler.call(msg, e)
          rescue => handler_ex
            handle_exception(handler_ex, { context: "Error calling death handler", job: msg })
          end
        end
      end

      raise e
    end

    # The local retry support means that any errors that occur within
    # this block can be associated with the given worker instance.
    # This is required to support the `sidekiq_retries_exhausted` block.
    #
    # Note that any exception from the block is wrapped in the Skip
    # exception so the global block does not reprocess the error. The
    # Skip exception is unwrapped within Sidekiq::Processor#process before
    # calling the handle_exception handlers.
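    #
    # For reference, a worker hooks retry exhaustion like this (a sketch;
    # the block receives the job hash and the exception, matching the call
    # in #retries_exhausted below):
    #
    #   class MyWorker
    #     include Sidekiq::Worker
    #     sidekiq_retries_exhausted do |msg, ex|
    #       Sidekiq.logger.warn "Giving up on #{msg['class']} #{msg['jid']}: #{ex.message}"
    #     end
    #   end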
    def local(worker, msg, queue)
      yield
    rescue Skip => ex
      raise ex
    rescue Sidekiq::Shutdown => ey
      # ignore, will be pushed back onto queue during hard_shutdown
      raise ey
    rescue Exception => e
      # if the failure was ultimately caused by shutdown, re-raise as a
      # Shutdown so the job is pushed back onto the queue during hard_shutdown
      raise Sidekiq::Shutdown if exception_caused_by_shutdown?(e)

      if msg['retry'].nil?
        msg['retry'] = worker.class.get_sidekiq_options['retry']
      end

      raise e unless msg['retry']
      attempt_retry(worker, msg, queue, e)
      # We've handled this error locally for this job; no need to handle
      # it again at the global level.
      raise Skip
    end

    private

    # Note that +worker+ can be nil here if an error is raised before we can
    # instantiate the worker instance. All access must be guarded and
    # best effort.
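    #
    # Jobs scheduled for another attempt end up in the 'retry' sorted set,
    # scored by the time of the next attempt. They can be inspected with
    # the API required at the top of this file, e.g. (a sketch):
    #
    #   Sidekiq::RetrySet.new.each do |job|
    #     puts "#{job.klass} retries at #{job.at}"
    #   end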
    def attempt_retry(worker, msg, queue, exception)
      max_retry_attempts = retry_attempts_from(msg['retry'], @max_retries)

      msg['queue'] = if msg['retry_queue']
        msg['retry_queue']
      else
        queue
      end

      # App code can stuff all sorts of crazy binary data into the error message
      # that won't convert to JSON.
      m = exception.message.to_s[0, 10_000]
      if m.respond_to?(:scrub!)
        m.force_encoding("utf-8")
        m.scrub!
      end

      msg['error_message'] = m
      msg['error_class'] = exception.class.name
      count = if msg['retry_count']
        msg['retried_at'] = Time.now.to_f
        msg['retry_count'] += 1
      else
        msg['failed_at'] = Time.now.to_f
        msg['retry_count'] = 0
      end

      if msg['backtrace'] == true
        msg['error_backtrace'] = exception.backtrace
      elsif !msg['backtrace']
        # do nothing
      elsif msg['backtrace'].to_i != 0
        msg['error_backtrace'] = exception.backtrace[0...msg['backtrace'].to_i]
      end

      if count < max_retry_attempts
        delay = delay_for(worker, count, exception)
        # Logging here can break retries if the logging device raises ENOSPC #3979
        # logger.debug { "Failure! Retry #{count} in #{delay} seconds" }
        retry_at = Time.now.to_f + delay
        payload = Sidekiq.dump_json(msg)

        Sidekiq.redis do |conn|
          conn.zadd('retry', retry_at.to_s, payload)
        end
      else
        # Goodbye dear message, you (re)tried your best I'm sure.
        retries_exhausted(worker, msg, exception)
      end
    end
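
    # Death handlers run when a job is about to die (retries exhausted or
    # retry disabled). Applications typically register them in server
    # configuration, e.g. (a sketch; ErrorTracker is a hypothetical
    # error-reporting class):
    #
    #   Sidekiq.configure_server do |config|
    #     config.death_handlers << ->(job, ex) do
    #       ErrorTracker.notify(ex, job_class: job['class'])
    #     end
    #   end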
    def retries_exhausted(worker, msg, exception)
      begin
        block = worker && worker.sidekiq_retries_exhausted_block
        block.call(msg, exception) if block
      rescue => e
        handle_exception(e, { context: "Error calling retries_exhausted", job: msg })
      end

      Sidekiq.death_handlers.each do |handler|
        begin
          handler.call(msg, exception)
        rescue => e
          handle_exception(e, { context: "Error calling death handler", job: msg })
        end
      end

      send_to_morgue(msg) unless msg['dead'] == false
    end

    def send_to_morgue(msg)
      logger.info { "Adding dead #{msg['class']} job #{msg['jid']}" }
      payload = Sidekiq.dump_json(msg)
      DeadSet.new.kill(payload, notify_failure: false)
    end

    def retry_attempts_from(msg_retry, default)
      if msg_retry.is_a?(Integer)
        msg_retry
      else
        default
      end
    end
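
    # A worker can override the default backoff computed below with a
    # `sidekiq_retry_in` block; it receives the retry count and the
    # exception and returns a number of seconds, e.g. (a sketch):
    #
    #   class MyWorker
    #     include Sidekiq::Worker
    #     sidekiq_retry_in { |count, exception| 60 * (count + 1) }
    #   end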
    def delay_for(worker, count, exception)
      if worker && worker.sidekiq_retry_in_block
        custom_retry_in = retry_in(worker, count, exception).to_i
        return custom_retry_in if custom_retry_in > 0
      end
      seconds_to_delay(count)
    end

    # delayed_job uses the same basic formula
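    # for the delay between attempts. For a rough feel (the rand term adds
    # jitter): the first retry waits about 15-44 seconds, the tenth about
    # two hours, and the 25th about four days; 25 attempts add up to
    # roughly three weeks, matching the note at the top of this file.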
    def seconds_to_delay(count)
      (count ** 4) + 15 + (rand(30) * (count + 1))
    end

    def retry_in(worker, count, exception)
      begin
        worker.sidekiq_retry_in_block.call(count, exception)
      rescue Exception => e
        handle_exception(e, { context: "Failure scheduling retry using the defined `sidekiq_retry_in` in #{worker.class.name}, falling back to default" })
        nil
      end
    end

    def exception_caused_by_shutdown?(e, checked_causes = [])
      return false unless e.cause

      # Handle circular causes
      checked_causes << e.object_id
      return false if checked_causes.include?(e.cause.object_id)

      e.cause.instance_of?(Sidekiq::Shutdown) ||
        exception_caused_by_shutdown?(e.cause, checked_causes)
    end
  end
end