File: retry_jobs.rb

require 'sidekiq/scheduled'
require 'sidekiq/api'

module Sidekiq
  module Middleware
    module Server
      ##
      # Automatically retry jobs that fail in Sidekiq.
      # Sidekiq's retry support assumes a typical development lifecycle:
      #
      #   0. Push a code change with a bug in it.
      #   1. The bug causes job processing to fail; Sidekiq's middleware
      #      captures the job and pushes it onto a retry queue.
      #   2. Sidekiq retries jobs in the retry queue multiple times with
      #      an exponential delay; the job continues to fail.
      #   3. After a few days, a developer deploys a fix. The job is
      #      reprocessed successfully.
      #   4. Once retries are exhausted, Sidekiq will give up and move the
      #      job to the Dead Job Queue (aka morgue) where it must be dealt with
      #      manually in the Web UI.
      #   5. After 6 months on the DJQ, Sidekiq will discard the job.
      #
      # A job looks like:
      #
      #     { 'class' => 'HardWorker', 'args' => [1, 2, 'foo'], 'retry' => true }
      #
      # The 'retry' option also accepts a number (in place of 'true'):
      #
      #     { 'class' => 'HardWorker', 'args' => [1, 2, 'foo'], 'retry' => 5 }
      #
      # The job will be retried this number of times before giving up. (If simply
      # 'true', Sidekiq retries 25 times.)
      #
      # We'll add a bit more data to the job to support retries:
      #
      #  * 'queue' - the queue to use
      #  * 'retry_count' - number of times we've retried so far.
      #  * 'error_message' - the message from the exception
      #  * 'error_class' - the exception class
      #  * 'failed_at' - the first time it failed
      #  * 'retried_at' - the last time it was retried
      #  * 'backtrace' - the number of lines of error backtrace to store
      #
      # We don't store the backtrace by default as that can add a lot of overhead
      # to the job and everyone is using an error service, right?
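      #
      # To opt in, set 'backtrace' to true (store the full backtrace) or to an
      # integer line count, e.g. (a sketch):
      #
      #    class MyWorker
      #      include Sidekiq::Worker
      #      sidekiq_options :backtrace => 20
      #    end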
      #
      # The default number of retry attempts is 25 which works out to about 3 weeks
      # of retries. You can pass a value for the max number of retry attempts when
      # adding the middleware using the options hash:
      #
      #   Sidekiq.configure_server do |config|
      #     config.server_middleware do |chain|
      #       chain.add Sidekiq::Middleware::Server::RetryJobs, :max_retries => 7
      #     end
      #   end
      #
      # or limit the number of retries for a particular worker with:
      #
      #    class MyWorker
      #      include Sidekiq::Worker
      #      sidekiq_options :retry => 10
      #    end
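      #
      # or disable retries for a worker entirely:
      #
      #    class MyWorker
      #      include Sidekiq::Worker
      #      sidekiq_options :retry => false
      #    end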
      #
      class RetryJobs
        include Sidekiq::Util

        DEFAULT_MAX_RETRY_ATTEMPTS = 25

        def initialize(options = {})
          @max_retries = options.fetch(:max_retries, DEFAULT_MAX_RETRY_ATTEMPTS)
        end

        def call(worker, msg, queue)
          yield
        rescue Sidekiq::Shutdown
          # ignore, will be pushed back onto queue during hard_shutdown
          raise
        rescue Exception => e
          # if the exception was ultimately caused by shutting down, re-raise
          # it as Shutdown so the job is pushed back onto the queue
          raise Sidekiq::Shutdown if exception_caused_by_shutdown?(e)

          raise e unless msg['retry']
          attempt_retry(worker, msg, queue, e)
        end

        private

        def attempt_retry(worker, msg, queue, exception)
          max_retry_attempts = retry_attempts_from(msg['retry'], @max_retries)

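          # retries normally go back to the job's original queue, but a worker
          # can opt into a dedicated retry queue, e.g. (a sketch):
          #
          #   sidekiq_options :retry_queue => 'low_priority'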
          msg['queue'] = msg['retry_queue'] || queue

          # App code can stuff all sorts of crazy binary data into the error message
          # that won't convert to JSON.
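          # (String#scrub! is only available on Ruby 2.1+, hence the
          # respond_to? guard; it replaces invalid UTF-8 bytes with "\uFFFD")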
          m = exception.message.to_s[0, 10_000]
          if m.respond_to?(:scrub!)
            m.force_encoding("utf-8")
            m.scrub!
          end

          msg['error_message'] = m
          msg['error_class'] = exception.class.name
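          # 'retry_count' tracks attempts so far: nil on the first failure (so
          # count is initialized to 0 below), then incremented on each
          # subsequent failure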
          count = if msg['retry_count']
            msg['retried_at'] = Time.now.to_f
            msg['retry_count'] += 1
          else
            msg['failed_at'] = Time.now.to_f
            msg['retry_count'] = 0
          end

          if msg['backtrace'] == true
            msg['error_backtrace'] = exception.backtrace
          elsif !msg['backtrace']
            # do nothing
          elsif msg['backtrace'].to_i != 0
            msg['error_backtrace'] = exception.backtrace[0...msg['backtrace'].to_i]
          end

          if count < max_retry_attempts
            delay = delay_for(worker, count, exception)
            logger.debug { "Failure! Retry #{count} in #{delay} seconds" }
            retry_at = Time.now.to_f + delay
            payload = Sidekiq.dump_json(msg)
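            # jobs in the 'retry' sorted set are scored by their retry time;
            # the scheduled poller (see sidekiq/scheduled) moves them back onto
            # their queue once that time has passed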
            Sidekiq.redis do |conn|
              conn.zadd('retry', retry_at.to_s, payload)
            end
          else
            # Goodbye dear message, you (re)tried your best I'm sure.
            retries_exhausted(worker, msg, exception)
          end

          raise exception
        end

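        # Runs the worker's retries-exhausted hook (or the global default)
        # before the job is sent to the morgue. A worker might define one like
        # this (a sketch):
        #
        #   class MyWorker
        #     include Sidekiq::Worker
        #     sidekiq_retries_exhausted do |msg|
        #       Sidekiq.logger.warn "Giving up on #{msg['class']} #{msg['jid']}"
        #     end
        #   end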
        def retries_exhausted(worker, msg, exception)
          logger.debug { "Retries exhausted for job" }
          begin
            block = worker.sidekiq_retries_exhausted_block || Sidekiq.default_retries_exhausted
            block.call(msg, exception) if block
          rescue => e
            handle_exception(e, { context: "Error calling retries_exhausted for #{worker.class}", job: msg })
          end

          send_to_morgue(msg) unless msg['dead'] == false
        end

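        # Pushes the job onto the 'dead' sorted set (scored by time of death),
        # then prunes entries older than DeadSet.timeout and trims the set to
        # at most DeadSet.max_jobs entries. Dead jobs can be inspected and
        # retried via Sidekiq::DeadSet in sidekiq/api.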
        def send_to_morgue(msg)
          Sidekiq.logger.info { "Adding dead #{msg['class']} job #{msg['jid']}" }
          payload = Sidekiq.dump_json(msg)
          now = Time.now.to_f
          Sidekiq.redis do |conn|
            conn.multi do
              conn.zadd('dead', now, payload)
              conn.zremrangebyscore('dead', '-inf', now - DeadSet.timeout)
              conn.zremrangebyrank('dead', 0, -DeadSet.max_jobs)
            end
          end
        end

        def retry_attempts_from(msg_retry, default)
          # 'retry' may be an integer count rather than true/false; test
          # against Integer since Fixnum is deprecated in Ruby 2.4+ and later
          # removed (on older Rubies Fixnum < Integer, so this still works)
          if msg_retry.is_a?(Integer)
            msg_retry
          else
            default
          end
        end

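        # && binds tighter than ||, so if the worker has no sidekiq_retry_in
        # block, or the block fails and retry_in returns nil, we fall back to
        # the default exponential backoff.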
        def delay_for(worker, count, exception)
          worker.sidekiq_retry_in_block? && retry_in(worker, count, exception) || seconds_to_delay(count)
        end

        # delayed_job uses the same basic formula
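        # e.g. count 0 waits 15-44s, count 5 roughly 10-14 minutes, count 24
        # about 3.8 days; all 25 delays together come to roughly 20 days,
        # hence "about 3 weeks" in the docs above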
        def seconds_to_delay(count)
          (count ** 4) + 15 + (rand(30)*(count+1))
        end

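        # Asks the worker's sidekiq_retry_in block for a custom delay, e.g.
        # (a sketch):
        #
        #   class MyWorker
        #     include Sidekiq::Worker
        #     sidekiq_retry_in { |count| 10 * (count + 1) }
        #   end
        #
        # Returns nil if the block raises, so delay_for falls back to the
        # default schedule.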
        def retry_in(worker, count, exception)
          begin
            worker.sidekiq_retry_in_block.call(count, exception).to_i
          rescue Exception => e
            handle_exception(e, { context: "Failure scheduling retry using the defined `sidekiq_retry_in` in #{worker.class.name}, falling back to default" })
            nil
          end
        end

        def exception_caused_by_shutdown?(e, checked_causes = [])
          # Exception#cause is only defined on Ruby 2.1+; skip the check on
          # older Rubies.
          return false unless defined?(e.cause)

          # guard against circular cause chains: remember this exception and
          # bail out if its cause has already been visited
          checked_causes << e.object_id
          return false if checked_causes.include?(e.cause.object_id)

          e.cause.instance_of?(Sidekiq::Shutdown) ||
            exception_caused_by_shutdown?(e.cause, checked_causes)
        end

      end
    end
  end
end