File: replication_monitor.rb

package info (click to toggle)
ruby-pg 1.5.6-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,264 kB
  • sloc: ansic: 8,820; ruby: 2,809; makefile: 10
file content (222 lines) | stat: -rw-r--r-- 5,465 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# -*- ruby -*-
# vim: set noet nosta sw=4 ts=4 :
#
# Get the current WAL segment and offset from a master postgresql
# server, and compare slave servers to see how far behind they
# are in MB.  This script should be easily modified for use with
# Nagios/Mon/Monit/Zabbix/whatever, or wrapping it in a display loop,
# and is suitable for both WAL shipping or streaming forms of replication.
#
# Mahlon E. Smith <mahlon@martini.nu>
#
# First argument is the master server, all other arguments are treated
# as slave machines.
#
#	db_replication.monitor db-master.example.com ...
#

require 'ostruct'
require 'optparse'
require 'pathname'
require 'etc'
require 'pg'
require 'pp'


### A class to encapsulate the PG handles.
###
class PGMonitor

	VERSION = %q$Id$

	# When to consider a slave as 'behind', measured in WAL segments.
	# The default WAL segment size is 16, so we'll alert after
	# missing two WAL files worth of data.
	#
	LAG_ALERT = 32

	### Create a new PGMonitor object.
	###
	def initialize( opts, hosts )
		@opts        = opts
		@master      = hosts.shift
		@slaves      = hosts
		@current_wal = {}
		@failures    = []
	end

	attr_reader :opts, :current_wal, :master, :slaves, :failures


	### Perform the connections and check the lag.
	###
	def check
		# clear prior failures, get current xlog info
		@failures = []
		return unless self.get_current_wal

		# check all slaves
		self.slaves.each do |slave|
			begin
				slave_db = PG.connect(
					:dbname   => self.opts.database,
					:host     => slave,
					:port     => self.opts.port,
					:user     => self.opts.user,
					:password => self.opts.pass,
					:sslmode  => 'prefer'
				)

				xlog = slave_db.exec( 'SELECT pg_last_xlog_receive_location()' ).getvalue( 0, 0 )
				slave_db.close

				lag_in_megs = ( self.find_lag( xlog ).to_f / 1024 / 1024 ).abs
				if lag_in_megs >= LAG_ALERT
					failures << { :host => slave,
						:error => "%0.2fMB behind the master." % [ lag_in_megs ] }
				end
			rescue => err
				failures << { :host => slave, :error => err.message }
			end
		end
	end


	#########
	protected
	#########

	### Ask the master for the current xlog information, to compare
	### to slaves.  Returns true on success.  On failure, populates
	### the failures array and returns false.
	###
	def get_current_wal
		master_db = PG.connect(
			:dbname   => self.opts.database,
			:host     => self.master,
			:port     => self.opts.port,
			:user     => self.opts.user,
			:password => self.opts.pass,
			:sslmode  => 'prefer'
		)

		self.current_wal[ :segbytes ] = master_db.exec( 'SHOW wal_segment_size' ).
			getvalue( 0, 0 ).sub( /\D+/, '' ).to_i << 20

		current = master_db.exec( 'SELECT pg_current_xlog_location()' ).getvalue( 0, 0 )
		self.current_wal[ :segment ], self.current_wal[ :offset ] = current.split( /\// )

		master_db.close
		return true

	# If we can't get any of the info from the master, then there is no
	# point in a comparison with slaves.
	#
	rescue => err
		self.failures << { :host => self.master,
			:error => 'Unable to retrieve required info from the master (%s)' % [ err.message ] }

		return false
	end


	### Given an +xlog+ position from a slave server, return
	### the number of bytes the slave needs to replay before it
	### is caught up to the master.
	###
	def find_lag( xlog )
		s_segment, s_offset = xlog.split( /\// )
		m_segment  = self.current_wal[ :segment ]
		m_offset   = self.current_wal[ :offset ]
		m_segbytes = self.current_wal[ :segbytes ]

		return (( m_segment.hex - s_segment.hex ) * m_segbytes) + ( m_offset.hex - s_offset.hex )
	end

end


### Parse command line arguments.  Return a struct of global options.
###
def parse_args( args )
	options          = OpenStruct.new
	options.database = 'postgres'
	options.port     = 5432
	options.user     = Etc.getpwuid( Process.uid ).name
	options.sslmode  = 'prefer'

	opts = OptionParser.new do |opts|
		opts.banner = "Usage: #{$0} [options] <master> <slave> [slave2, slave3...]"

		opts.separator ''
		opts.separator 'Connection options:'

		opts.on( '-d', '--database DBNAME',
				"specify the database to connect to (default: \"#{options.database}\")" ) do |db|
			options.database = db
		end

		opts.on( '-h', '--host HOSTNAME', 'database server host' ) do |host|
			options.host = host
		end

		opts.on( '-p', '--port PORT', Integer,
				"database server port (default: \"#{options.port}\")" ) do |port|
			options.port = port
		end

		opts.on( '-U', '--user NAME',
				"database user name (default: \"#{options.user}\")" ) do |user|
			options.user = user
		end

		opts.on( '-W', 'force password prompt' ) do |pw|
			print 'Password: '
			begin
				system 'stty -echo'
				options.pass = $stdin.gets.chomp
			ensure
				system 'stty echo'
				puts
			end
		end

		opts.separator ''
		opts.separator 'Other options:'

		opts.on_tail( '--help', 'show this help, then exit' ) do
			$stderr.puts opts
			exit
		end

		opts.on_tail( '--version', 'output version information, then exit' ) do
			puts PGMonitor::VERSION
			exit
		end
	end

	opts.parse!( args )
	return options
end



if __FILE__ == $0
	opts = parse_args( ARGV )
	raise ArgumentError, "At least two PostgreSQL servers are required." if ARGV.length < 2
	mon = PGMonitor.new( opts, ARGV )

	mon.check
	if mon.failures.empty?
		puts "All is well!"
		exit 0
	else
		puts "Database replication delayed or broken."
		mon.failures.each do |bad|
			puts "%s: %s" % [ bad[ :host ], bad[ :error ] ]
		end
		exit 1
	end
end