1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
|
#!/usr/bin/env python3
# Collect per-device btrfs filesystem errors.
# Designed to work on Debian and Centos 6 (with python2.6).
import glob
import os
import re
import subprocess
def get_btrfs_mount_points():
"""List all btrfs mount points.
Yields:
(string) filesystem mount points.
"""
with open("/proc/mounts") as f:
for line in f:
parts = line.split()
if parts[2] == "btrfs":
yield parts[1]
def get_btrfs_errors(mountpoint):
"""Get per-device errors for a btrfs mount point.
Args:
mountpoint: (string) path to a mount point.
Yields:
(device, error_type, error_count) tuples, where:
device: (string) path to block device.
error_type: (string) type of btrfs error.
error_count: (int) number of btrfs errors of a given type.
"""
p = subprocess.Popen(["btrfs", "device", "stats", mountpoint],
stdout=subprocess.PIPE)
(stdout, stderr) = p.communicate()
if p.returncode != 0:
raise RuntimeError("btrfs returned exit code %d" % p.returncode)
for line in stdout.splitlines():
if line == '':
continue
# Sample line:
# [/dev/vdb1].flush_io_errs 0
m = re.search(r"^\[([^\]]+)\]\.(\S+)\s+(\d+)$", line.decode("utf-8"))
if not m:
raise RuntimeError("unexpected output from btrfs: '%s'" % line)
yield m.group(1), m.group(2), int(m.group(3))
def btrfs_error_metrics():
"""Collect btrfs error metrics.
Returns:
a list of strings to be exposed as Prometheus metrics.
"""
metric = "node_btrfs_errors_total"
contents = [
"# TYPE %s counter" % metric,
"# HELP %s number of btrfs errors" % metric,
]
for mountpoint in get_btrfs_mount_points():
for device, error_type, error_count in get_btrfs_errors(mountpoint):
contents.append(
'%s{mountpoint="%s",device="%s",type="%s"} %d' %
(metric, mountpoint, device, error_type, error_count))
if len(contents) > 2:
# return metrics if there are actual btrfs filesystems found
# (i.e. `contents` contains more than just TYPE and HELP).
return contents
else:
return []
def btrfs_allocation_metrics():
"""Collect btrfs allocation metrics.
Returns:
a list of strings to be exposed as Prometheus metrics.
"""
prefix = 'node_btrfs_allocation'
metric_to_filename = {
'size_bytes': 'total_bytes',
'used_bytes': 'bytes_used',
'reserved_bytes': 'bytes_reserved',
'pinned_bytes': 'bytes_pinned',
'disk_size_bytes': 'disk_total',
'disk_used_bytes': 'disk_used',
}
contents = []
for m, f in metric_to_filename.items():
contents += [
"# TYPE %s_%s gauge" % (prefix, m),
"# HELP %s_%s btrfs allocation data (%s)" % (prefix, m, f),
]
for alloc in glob.glob("/sys/fs/btrfs/*/allocation"):
fs = alloc.split('/')[4]
for type_ in ('data', 'metadata', 'system'):
for m, f in metric_to_filename.items():
filename = os.path.join(alloc, type_, f)
with open(filename) as f:
value = int(f.read().strip())
contents.append('%s_%s{fs="%s",type="%s"} %d' % (
prefix, m, fs, type_, value))
if len(contents) > 2 * len(metric_to_filename):
return contents
else:
return []
if __name__ == "__main__":
contents = btrfs_error_metrics() + btrfs_allocation_metrics()
print("\n".join(contents))
|