File: iosnoop

package info (click to toggle)
perf-tools-unstable 1.0.1~20200130%2Bgit49b8cdf-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, sid
  • size: 928 kB
  • sloc: sh: 2,528; perl: 278; makefile: 20
file content (296 lines) | stat: -rwxr-xr-x 9,112 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
#!/bin/bash
#
# iosnoop - trace block device I/O.
#           Written using Linux ftrace.
#
# This traces disk I/O at the block device interface, using the block:
# tracepoints. This can help characterize the I/O requested for the storage
# devices and their resulting performance. I/O completions can also be studied
# event-by-event for debugging disk and controller I/O scheduling issues.
#
# USAGE: ./iosnoop [-hQst] [-d device] [-i iotype] [-p pid] [-n name] [duration]
#
# Run "iosnoop -h" for full usage.
#
# REQUIREMENTS: FTRACE CONFIG, block:block_rq_* tracepoints (you may
# already have these on recent kernels).
#
# OVERHEAD: By default, iosnoop works without buffering, printing I/O events
# as they happen (uses trace_pipe), context switching and consuming CPU to do
# so. This has a limit of about 10,000 IOPS (depending on your platform), at
# which point iosnoop will be consuming 1 CPU. The duration mode uses buffering,
# and can handle much higher IOPS rates, however, the buffer has a limit of
# about 50,000 I/O, after which events will be dropped. You can tune this with
# bufsize_kb, which is per-CPU. Also note that the "-n" option is currently
# post-filtered, so all events are traced.
#
# This was written as a proof of concept for ftrace. It would be better written
# using perf_events (after some capabilities are added), which has a better
# buffering policy, or a tracer such as SystemTap or ktap.
#
# From perf-tools: https://github.com/brendangregg/perf-tools
#
# See the iosnoop(8) man page (in perf-tools) for more info.
#
# COPYRIGHT: Copyright (c) 2014 Brendan Gregg.
#
#  This program is free software; you can redistribute it and/or
#  modify it under the terms of the GNU General Public License
#  as published by the Free Software Foundation; either version 2
#  of the License, or (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software Foundation,
#  Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#  (http://www.gnu.org/copyleft/gpl.html)
#
# 12-Jul-2014	Brendan Gregg	Created this.

### default variables
tracing=/sys/kernel/debug/tracing
flock=/var/tmp/.ftrace-lock
bufsize_kb=4096
opt_duration=0; duration=; opt_name=0; name=; opt_pid=0; pid=; ftext=
opt_start=0; opt_end=0; opt_device=0; device=; opt_iotype=0; iotype=
opt_queue=0
trap ':' INT QUIT TERM PIPE HUP	# sends execution to end tracing section

function usage {
	cat <<-END >&2
	USAGE: iosnoop [-hQst] [-d device] [-i iotype] [-p PID] [-n name]
	               [duration]
	                 -d device       # device string (eg, "202,1)
	                 -i iotype       # match type (eg, '*R*' for all reads)
	                 -n name         # process name to match on I/O issue
	                 -p PID          # PID to match on I/O issue
	                 -Q              # use queue insert as start time
	                 -s              # include start time of I/O (s)
	                 -t              # include completion time of I/O (s)
	                 -h              # this usage message
	                 duration        # duration seconds, and use buffers
	  eg,
	       iosnoop                   # watch block I/O live (unbuffered)
	       iosnoop 1                 # trace 1 sec (buffered)
	       iosnoop -Q                # include queueing time in LATms
	       iosnoop -ts               # include start and end timestamps
	       iosnoop -i '*R*'          # trace reads
	       iosnoop -p 91             # show I/O issued when PID 91 is on-CPU
	       iosnoop -Qp 91            # show I/O queued by PID 91, queue time

	See the man page and example file for more info.
END
	exit
}

function warn {
	if ! eval "$@"; then
		echo >&2 "WARNING: command failed \"$@\""
	fi
}

function end {
	# disable tracing
	echo 2>/dev/null
	echo "Ending tracing..." 2>/dev/null
	cd $tracing
	warn "echo 0 > events/block/$b_start/enable"
	warn "echo 0 > events/block/block_rq_complete/enable"
	if (( opt_device || opt_iotype || opt_pid )); then
		warn "echo 0 > events/block/$b_start/filter"
		warn "echo 0 > events/block/block_rq_complete/filter"
	fi
	warn "echo > trace"
	(( wroteflock )) && warn "rm $flock"
}

function die {
	echo >&2 "$@"
	exit 1
}

function edie {
	# die with a quiet end()
	echo >&2 "$@"
	exec >/dev/null 2>&1
	end
	exit 1
}

### process options
while getopts d:hi:n:p:Qst opt
do
	case $opt in
	d)	opt_device=1; device=$OPTARG ;;
	i)	opt_iotype=1; iotype=$OPTARG ;;
	n)	opt_name=1; name=$OPTARG ;;
	p)	opt_pid=1; pid=$OPTARG ;;
	Q)	opt_queue=1 ;;
	s)	opt_start=1 ;;
	t)	opt_end=1 ;;
	h|?)	usage ;;
	esac
done
shift $(( $OPTIND - 1 ))
if (( $# )); then
	opt_duration=1
	duration=$1
	shift
fi
if (( opt_device )); then
	major=${device%,*}
	minor=${device#*,}
	dev=$(( (major << 20) + minor ))
fi

### option logic
(( opt_pid && opt_name )) && die "ERROR: use either -p or -n."
(( opt_pid )) && ftext=" issued by PID $pid"
(( opt_name )) && ftext=" issued by process name \"$name\""
if (( opt_duration )); then
	echo "Tracing block I/O$ftext for $duration seconds (buffered)..."
else
	echo "Tracing block I/O$ftext. Ctrl-C to end."
fi
if (( opt_queue )); then
	b_start=block_rq_insert
else
	b_start=block_rq_issue
fi

### select awk
(( opt_duration )) && use=mawk || use=gawk	# workaround for mawk fflush()
[[ -x /usr/bin/$use ]] && awk=$use || awk=awk
wroteflock=1

### check permissions
cd $tracing || die "ERROR: accessing tracing. Root user? Kernel has FTRACE?
    debugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)"

### ftrace lock
[[ -e $flock ]] && die "ERROR: ftrace may be in use by PID $(cat $flock) $flock"
echo $$ > $flock || die "ERROR: unable to write $flock."

### setup and begin tracing
echo nop > current_tracer
warn "echo $bufsize_kb > buffer_size_kb"
filter=
if (( opt_iotype )); then
	filter="rwbs ~ \"$iotype\""
fi
if (( opt_device )); then
	[[ "$filter" != "" ]] && filter="$filter && "
	filter="${filter}dev == $dev"
fi
filter_i=$filter
if (( opt_pid )); then
	[[ "$filter_i" != "" ]] && filter_i="$filter_i && "
	filter_i="${filter_i}common_pid == $pid"
	[[ "$filter" == "" ]] && filter=0
fi
if (( opt_iotype || opt_device || opt_pid )); then
	if ! echo "$filter_i" > events/block/$b_start/filter || \
	    ! echo "$filter" > events/block/block_rq_complete/filter
	then
		edie "ERROR: setting -d or -t filter. Exiting."
	fi
fi
if ! echo 1 > events/block/$b_start/enable || \
    ! echo 1 > events/block/block_rq_complete/enable; then
	edie "ERROR: enabling block I/O tracepoints. Exiting."
fi
(( opt_start )) && printf "%-15s " "STARTs"
(( opt_end )) && printf "%-15s " "ENDs"
printf "%-12.12s %-6s %-4s %-8s %-12s %-6s %8s\n" \
    "COMM" "PID" "TYPE" "DEV" "BLOCK" "BYTES" "LATms"

#
# Determine output format. It may be one of the following (newest first):
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#           TASK-PID    CPU#    TIMESTAMP  FUNCTION
# To differentiate between them, the number of header fields is counted,
# and an offset set, to skip the extra column when needed.
#
offset=$($awk 'BEGIN { o = 0; }
	$1 == "#" && $2 ~ /TASK/ && NF == 6 { o = 1; }
	$2 ~ /TASK/ { print o; exit }' trace)

### print trace buffer
warn "echo > trace"
( if (( opt_duration )); then
	# wait then dump buffer
	sleep $duration
	cat trace
else
	# print buffer live
	cat trace_pipe
fi ) | $awk -v o=$offset -v opt_name=$opt_name -v name=$name \
    -v opt_duration=$opt_duration -v opt_start=$opt_start -v opt_end=$opt_end \
    -v b_start=$b_start '
	# common fields
	$1 != "#" {
		# task name can contain dashes
		comm = pid = $1
		sub(/-[0-9][0-9]*/, "", comm)
		sub(/.*-/, "", pid)
		time = $(3+o); sub(":", "", time)
		dev = $(5+o)
	}

	# block I/O request
	$1 != "#" && $0 ~ b_start {
		if (opt_name && match(comm, name) == 0)
			next
		#
		# example: (fields1..4+o) 202,1 W 0 () 12862264 + 8 [tar]
		# The cmd field "()" might contain multiple words (hex),
		# hence stepping from the right (NF-3).
		#
		loc = $(NF-3)
		starts[dev, loc] = time
		comms[dev, loc] = comm
		pids[dev, loc] = pid
		next
	}

	# block I/O completion
	$1 != "#" && $0 ~ /rq_complete/ {
		#
		# example: (fields1..4+o) 202,1 W () 12862256 + 8 [0]
		#
		dir = $(6+o)
		loc = $(NF-3)
		nsec = $(NF-1)

		if (starts[dev, loc] > 0) {
			latency = sprintf("%.2f",
			    1000 * (time - starts[dev, loc]))
			comm = comms[dev, loc]
			pid = pids[dev, loc]

			if (opt_start)
				printf "%-15s ", starts[dev, loc]
			if (opt_end)
				printf "%-15s ", time
			printf "%-12.12s %-6s %-4s %-8s %-12s %-6s %8s\n",
			    comm, pid, dir, dev, loc, nsec * 512, latency
			if (!opt_duration)
				fflush()

			delete starts[dev, loc]
			delete comms[dev, loc]
			delete pids[dev, loc]
		}
		next
	}

	$0 ~ /LOST.*EVENTS/ { print "WARNING: " $0 > "/dev/stderr" }
'

### end tracing
end