1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439
|
#!/usr/bin/env python3
""" gpu-mon - Displays a continuously updating view of the status of all
active GPUs.
Part of the rickslab-gpu-utils package which includes gpu-ls, gpu-mon,
gpu-pac, and gpu-plot.
A utility to give the current state of all compatible GPUs. The default
behavior is to continuously update a text based table in the current window
until Ctrl-C is pressed. With the *--gui* option, a table of relevant
parameters will be updated in a Gtk window. You can specify the delay
between updates with the *--sleep N* option where N is an integer > zero
that specifies the number of seconds to sleep between updates. The
*--no_fan* option can be used to disable the reading and display of fan
information. The *--log* option is used to write all monitor data to a psv
log file. When writing to a log file, the utility will indicate this in red
at the top of the window with a message that includes the log file name. The
*--plot* will display a plot of critical GPU parameters which updates at the
specified *--sleep N* interval. If you need both the plot and monitor
displays, then using the --plot option is preferred over running both tools
as a single read of the GPUs is used to update both displays. The *--ltz*
option results in the use of local time instead of UTC. The *--verbose*
option will display progress and informational messages generated by the
utilities.
Copyright (C) 2019 RicksLab
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program. If not, see <https://www.gnu.org/licenses/>.
"""
__author__ = 'RicksLab'
__copyright__ = 'Copyright (C) 2019 RicksLab'
__license__ = 'GNU General Public License'
__program_name__ = 'gpu-mon'
__maintainer__ = 'RicksLab'
__docformat__ = 'reStructuredText'
# pylint: disable=multiple-statements
# pylint: disable=line-too-long
# pylint: disable=consider-using-f-string
import argparse
import subprocess
import threading
import os
import logging
import sys
from shlex import split as shlex_split
import shutil
from time import sleep
import signal
from typing import Callable, Any, Optional
# Optional Gtk support: GTK records whether the GUI stack could be imported.
# Every failure path must assign GTK, because the module-level
# `if not GTK:` test that selects the MonitorWindow variant reads it
# unconditionally.
try:
    import gi
    gi.require_version('Gtk', '3.0')
    from gi.repository import GLib, Gtk
    from GPUmodules import GPUgui
    GTK = True
except (ModuleNotFoundError, ValueError) as error:
    # ModuleNotFoundError: a module in the try body is not installed.
    # ValueError: presumably raised by gi.require_version when Gtk 3.0 is
    # not available - TODO confirm against PyGObject.
    print('gi import error: {}'.format(error))
    print('gi is required for {}'.format(__program_name__))
    print(' In a venv, first install vext: pip install --no-cache-dir vext')
    print(' Then install vext.gi: pip install --no-cache-dir vext.gi')
    # Previously left unset here, which turned a missing gi into a NameError
    # at the later `if not GTK:` check instead of the non-gui fallback.
    GTK = False
except ImportError as error:
    # Any other import failure (ModuleNotFoundError is handled above).
    print('gi import error: {}'.format(error))
    print('If not using system python version, you may get a circular import error.')
    print('Using non-gui version of gpu-mon.')
    # Pause so the message can be read before the text monitor clears the screen.
    sleep(3)
    GTK = False
from GPUmodules import __version__, __status__, __credits__
from GPUmodules import GPUmodule as Gpu
from GPUmodules.env import GUT_CONST
from GPUmodules.GPUKeys import SensorSet
LOGGER = logging.getLogger('gpu-utils')
def ctrl_c_handler(target_signal: Any, _frame: Any) -> None:
    """
    Signal handler that asks the monitor to stop by raising the shared quit flag.

    The flag is set on the MonitorWindow class itself, so it is visible both
    through the class and through any instance that has not set its own value.

    :param target_signal: Signal identifier handed over by the signal module;
        only used in the debug log entry.
    :param _frame: Stack frame handed over by the signal module; unused.
    """
    LOGGER.debug('ctrl_c_handler (ID: %s) has been caught. Setting quit flag...', target_signal)
    print('Setting quit flag...')
    # Class attribute, not instance attribute: no window object is available here.
    MonitorWindow.quit = True
# Install the quit-flag handler for SIGINT (Ctrl-C).
signal.signal(signal.SIGINT, ctrl_c_handler)

# SEMAPHORE ############
# Single-permit semaphore; update_data() takes it without blocking and skips
# the update when it is already held.
UD_SEM = threading.Semaphore(value=1)
########################
if not GTK:
    GUT_CONST.process_message('Gtk import error, Gui disabled', log_flag=True)

    class MonitorWindow:
        """
        Stand-in for the monitor window when Gtk could not be imported.

        Exposes the same class-level attributes as the Gtk variant, with
        gui_enabled set to False.
        """
        quit: bool = False
        gui_enabled: bool = False
        item_width: int = GUT_CONST.mon_field_width
        label_width: int = 12

        def __init__(self, gpu_list: Optional[Gpu.GpuList] = None, devices: dict = None):
            """
            Only writes a debug log entry; both arguments are ignored.

            :param gpu_list: Unused.
            :param devices: Unused.
            """
            LOGGER.debug('started with Gtk disabled')

        def set_quit(self, _arg2, _arg3) -> None:
            """
            Raise the quit flag on this instance.

            :param _arg2: Ignored.
            :param _arg3: Ignored.
            """
            self.quit = True
else:
    set_gtk_prop = GPUgui.GuiProps.set_gtk_prop

    class MonitorWindow(Gtk.Window):
        """
        Gtk window holding the monitor table: one header column of parameter
        names followed by one column of value labels per GPU.
        """
        quit: bool = False
        gui_enabled: bool = True
        item_width: int = GUT_CONST.mon_field_width
        label_width: int = 12

        def __init__(self, gpu_list: Gpu.GpuList, devices: dict):
            """
            Build the window and register every per-GPU label in *devices*.

            Exits the process with status -1 when Gtk.init_check reports failure.

            :param gpu_list: GPUs to show; one table column is created per GPU.
            :param devices: Dictionary filled in place, keyed by GPU uuid, each
                value mapping 'card_num' and every table parameter name to
                its Gtk.Label.
            """
            gtk_status = Gtk.init_check(sys.argv)
            LOGGER.debug('init_check: %s', gtk_status)
            if not gtk_status[0]:
                print('Gtk Error, Exiting')
                sys.exit(-1)

            Gtk.Window.__init__(self, title=GUT_CONST.gui_window_title)
            self.set_border_width(0)
            self.set_resizable(False)
            GPUgui.GuiProps.set_style()

            icon_path = GUT_CONST.icon_file
            if icon_path:
                LOGGER.debug('Icon file: [%s]', icon_path)
                if os.path.isfile(icon_path):
                    self.set_icon_from_file(icon_path)

            grid = Gtk.Grid()
            self.add(grid)

            # Optional warning banners, each spanning the header column plus
            # one column per GPU.
            banner_span = gpu_list.num_gpus()['total'] + 1
            banner_markups = []
            if GUT_CONST.debug:
                banner_markups.append('<big><b> DEBUG Logger Active </b></big>')
            if GUT_CONST.log:
                banner_markups.append('<big><b> Logging to: </b>{}</big>'.format(GUT_CONST.log_file))
            for banner_row, markup in enumerate(banner_markups):
                banner = Gtk.Label(name='warn_label')
                banner.set_markup(markup)
                banner_box = Gtk.Box(spacing=6, name='warn_box')
                set_gtk_prop(banner, top=1, bottom=1, right=1, left=1)
                banner_box.pack_start(banner, True, True, 0)
                grid.attach(banner_box, 0, banner_row, banner_span, 1)
            # First grid row of the table proper, below any banners.
            table_top = len(banner_markups)

            # Header column (grid column 0): 'Card #' followed by the parameter names.
            card_header = Gtk.Label(name='white_label', halign=Gtk.Align.CENTER, valign=Gtk.Align.CENTER)
            card_header.set_markup('<b>Card #</b>')
            headers = {'card_num': card_header}
            for param_name, param_label in Gpu.GpuItem.table_param_labels.items():
                header = Gtk.Label(name='white_label', halign=Gtk.Align.START, valign=Gtk.Align.CENTER)
                headers[param_name] = header
                header.set_markup('<b>{}</b>'.format(param_label))
            for row, header in enumerate(headers.values(), start=table_top):
                header_box = Gtk.Box(spacing=6, name='head_box')
                set_gtk_prop(header_box, top=1, bottom=1, right=1, left=1)
                set_gtk_prop(header, top=1, bottom=1, right=4, left=4)
                header_box.pack_start(header, True, True, 0)
                grid.attach(header_box, 0, row, 1, 1)

            # Create the value labels for each GPU and register them in devices.
            for gpu in gpu_list.gpus():
                uuid = gpu.prm.uuid
                card_label = Gtk.Label(name='white_label')
                gpu_widgets = {'card_num': card_label}
                devices[uuid] = gpu_widgets
                card_label.set_markup('<b>CARD{}</b>'.format(gpu.get_params_value('card_num')))
                card_label.set_use_markup(True)
                for param_name in Gpu.GpuItem.table_param_labels:
                    value_label = Gtk.Label(label=gpu.get_params_value(str(param_name)),
                                            name='white_label')
                    gpu_widgets[param_name] = value_label
                    value_label.set_width_chars(self.item_width)
                    set_gtk_prop(value_label, width_chars=self.item_width)

            # Place one grid column per entry in devices, starting at column 1.
            for col, gpu_widgets in enumerate(devices.values(), start=1):
                for row, (comp_name, comp_item) in enumerate(gpu_widgets.items(), start=table_top):
                    # Labels start out blank; update_data() fills them in.
                    comp_item.set_text('')
                    box_name = 'head_box' if comp_name == 'card_num' else 'med_box'
                    cell_box = Gtk.Box(spacing=6, name=box_name)
                    set_gtk_prop(cell_box, top=1, bottom=1, right=1, left=1)
                    set_gtk_prop(comp_item, top=1, bottom=1, right=3, left=3, width_chars=self.item_width)
                    cell_box.pack_start(comp_item, True, True, 0)
                    grid.attach(cell_box, col, row, 1, 1)

        def set_quit(self, _arg2, _arg3) -> None:
            """
            Raise the quit flag on this instance; main() connects this to the
            window's 'delete-event'.

            :param _arg2: Ignored.
            :param _arg3: Ignored.
            """
            self.quit = True
def update_data(gpu_list: Gpu.GpuList, devices: dict, cmd: Optional[subprocess.Popen]) -> None:
    """
    Read the monitor sensor set once and push the values to the log file,
    the gpu-plot pipe and the Gtk labels.

    Only one update runs at a time: if the update semaphore is already held
    the call returns immediately without reading anything. The semaphore is
    always released on the way out, even when a read or widget update raises,
    so a single failure cannot block all later updates.

    :param gpu_list: A gpuList object with all gpuItems
    :param devices: A dictionary linking Gui items with data, keyed by GPU uuid.
    :param cmd: Subprocess return from running plot, or None when gpu-plot
        was not started.
    """
    # SEMAPHORE ############
    if not UD_SEM.acquire(blocking=False):
        if GUT_CONST.verbose: print('Update while updating, skipping new update')
        LOGGER.debug('Update while updating, skipping new update')
        return
    ########################
    try:
        gpu_list.read_gpu_sensor_set(data_type=SensorSet.Monitor)
        if GUT_CONST.log:
            gpu_list.print_log(GUT_CONST.log_file_ptr)
        if GUT_CONST.plot:
            if cmd is None or cmd.stdin is None:
                # Plotting was requested but there is no pipe to write to;
                # switch it off rather than failing on every refresh.
                LOGGER.debug('gpu-plot is not running, plot output disabled')
                GUT_CONST.plot = False
            else:
                try:
                    gpu_list.print_plot(cmd.stdin)
                except (OSError, KeyboardInterrupt) as except_err:
                    # A failed write (e.g. broken pipe) means the plot process
                    # is gone; stop feeding it.
                    LOGGER.debug('gpu-plot has closed: [%s]', except_err)
                    print('gpu-plot has closed')
                    GUT_CONST.plot = False

        # update gui
        for uuid, gui_component in devices.items():
            gpu = gpu_list[uuid]
            for comp_name, comp_item in gui_component.items():
                if comp_name == 'card_num':
                    comp_item.set_markup('<b>Card{}</b>'.format(gpu.get_params_value('card_num')))
                else:
                    data_value_raw = gpu.get_params_value(comp_name)
                    LOGGER.debug('raw data value: %s', data_value_raw)
                    data_value_raw = Gpu.format_table_value(data_value_raw, comp_name)
                    # Truncate to the column width.
                    data_value = str(data_value_raw)[:MonitorWindow.item_width]
                    comp_item.set_text(data_value)
                    set_gtk_prop(comp_item, width_chars=MonitorWindow.item_width)

        # Process queued Gtk events before returning.
        while Gtk.events_pending():
            Gtk.main_iteration_do(True)
    finally:
        # SEMAPHORE ############
        UD_SEM.release()
        ########################
def refresh(refreshtime: int, update_data_func: Callable, gpu_list: Gpu.GpuList, devices: dict,
            cmd: subprocess.Popen, gmonitor: 'Gtk.Window') -> None:
    """
    Worker loop for the monitor thread: schedule one data update on the GLib
    main loop, wait *refreshtime* seconds, repeat until the quit flag is set.

    Never returns normally. When gmonitor.quit is true it calls
    Gtk.main_quit() and then sys.exit(0), which raises SystemExit in the
    calling thread.

    :param refreshtime: Amount of seconds to sleep after refresh.
    :param update_data_func: Function that does actual data update; it is
        called with (gpu_list, devices, cmd) through GLib.idle_add.
    :param gpu_list: A gpuList object with all gpuItems
    :param devices: A dictionary linking Gui items with data.
    :param cmd: Subprocess return from running plot.
    :param gmonitor: Monitor window whose quit attribute is polled.
    """
    sleep_interval = 0.2
    # Count whole ticks instead of accumulating floats: adding 0.2 ten times
    # yields 1.9999999999999998, so the old `tst < refreshtime` test slept an
    # eleventh tick (2.2 s) for the default 2 s interval.
    ticks = round(refreshtime / sleep_interval)
    while True:
        if gmonitor.quit:
            print('Quitting...')
            # NOTE(review): this runs on the worker thread, not the Gtk main
            # thread - confirm whether it should go through GLib.idle_add.
            Gtk.main_quit()
            sys.exit(0)
        GLib.idle_add(update_data_func, gpu_list, devices, cmd)
        for _ in range(ticks):
            # Poll the quit flag every tick so closing the window does not
            # have to wait out the remainder of the refresh interval.
            if gmonitor.quit:
                break
            sleep(sleep_interval)
def main() -> None:
    """
    Flow for gpu-mon.

    Parses the command line, validates the environment, builds the list of
    monitor-compatible GPUs, optionally opens the log file and the gpu-plot
    subprocess, then runs either the Gtk monitor or the text monitor until
    the quit flag is set.

    Exits with status -1 on an invalid --sleep value, an environment error,
    or when no (compatible) GPUs are found; exits 0 after --about.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--about', help='README', action='store_true', default=False)
    parser.add_argument('--gui', help='Display GTK Version of Monitor', action='store_true', default=False)
    parser.add_argument('--log', help='Write all monitor data to logfile', action='store_true', default=False)
    parser.add_argument('--plot', help='Open and write to gpu-plot', action='store_true', default=False)
    parser.add_argument('--ltz', help='Use local time zone instead of UTC', action='store_true', default=False)
    parser.add_argument('--verbose', help='Display informational message of GPU util progress',
                        action='store_true', default=False)
    parser.add_argument('--sleep', help='Number of seconds to sleep between updates', type=int, default=2)
    parser.add_argument('--no_fan', help='do not include fan setting options', action='store_true', default=False)
    parser.add_argument('-d', '--debug', help='Debug output', action='store_true', default=False)
    parser.add_argument('--pdebug', help='Plot debug output', action='store_true', default=False)
    args = parser.parse_args()

    # About me
    if args.about:
        print(__doc__)
        print('Author: ', __author__)
        print('Copyright: ', __copyright__)
        print('Credits: ', *['\n {}'.format(item) for item in __credits__])
        print('License: ', __license__)
        print('Version: ', __version__)
        print('Install Type: ', GUT_CONST.install_type)
        print('Maintainer: ', __maintainer__)
        print('Status: ', __status__)
        sys.exit(0)

    # The documented contract is "an integer > zero"; the previous `<= 1`
    # test also rejected a 1 second interval.
    if int(args.sleep) <= 0:
        print('Invalid value for sleep specified. Must be an integer greater than zero')
        sys.exit(-1)

    GUT_CONST.set_args(args, __program_name__)
    LOGGER.debug('########## %s %s', __program_name__, __version__)

    if GUT_CONST.check_env() < 0:
        print('Error in environment. Exiting...')
        sys.exit(-1)

    # Get list of GPUs and exit if no GPUs detected
    gpu_list = Gpu.GpuList()
    gpu_list.set_gpu_list()
    num_gpus = gpu_list.num_gpus()
    if num_gpus['total'] == 0:
        print('No GPUs detected, exiting...')
        sys.exit(-1)

    # Display vendor and driver details
    Gpu.print_driver_vendor_summary(gpu_list)

    # Read data static/dynamic/info/state driver information for GPUs
    gpu_list.read_gpu_sensor_set(data_type=SensorSet.All)

    # Check number of readable/writable
    print('All GPUs:\n {}'.format(gpu_list))

    # Select GPU's appropriate for monitor
    com_gpu_list = Gpu.set_mon_plot_compatible_gpu_list(gpu_list)

    # Check readable and compatible GPUs
    num_gpus = com_gpu_list.num_gpus()
    print('Compatible GPUs:')
    if num_gpus['total'] == 0:
        print('No readable and compatible GPUs detected, exiting...')
        sys.exit(-1)
    print(' {}'.format(com_gpu_list))

    if args.log:
        GUT_CONST.log = True
        GUT_CONST.log_file = './log_monitor_{}.txt'.format(
            GUT_CONST.now(ltz=GUT_CONST.useltz).strftime('%m%d_%H%M%S'))
        # Not a `with` block: the file stays open for the life of the monitor
        # and is closed when the selected monitor loop ends.
        GUT_CONST.log_file_ptr = open(GUT_CONST.log_file, 'w', buffering=1, encoding='utf-8')
        gpu_list.print_log_header(GUT_CONST.log_file_ptr)

    # The plot is fed from the Gtk monitor's update, so --plot implies --gui.
    if args.plot:
        args.gui = True

    if not MonitorWindow.gui_enabled:
        args.gui = False
        GUT_CONST.process_message('Gtk not found, Gui disabled', log_flag=True)
        if args.plot:
            GUT_CONST.process_message('Gtk not found, plot disabled', log_flag=True)
            args.plot = False

    if args.gui:
        # Display Gtk style Monitor
        devices = {}
        gmonitor = MonitorWindow(com_gpu_list, devices)
        gmonitor.connect('delete-event', gmonitor.set_quit)
        gmonitor.show_all()

        cmd = None
        if args.plot:
            GUT_CONST.plot = True
            if GUT_CONST.install_type == 'repository':
                plot_util = './gpu-plot'
            else:
                plot_util = shutil.which('gpu-plot')
                if not plot_util:
                    plot_util = os.path.join(GUT_CONST.repository_path, 'gpu-plot')
            if os.path.isfile(plot_util):
                if GUT_CONST.pdebug:
                    cmd_str = '{} --debug --stdin --sleep {}'.format(plot_util, GUT_CONST.sleep)
                else:
                    cmd_str = '{} --stdin --sleep {}'.format(plot_util, GUT_CONST.sleep)
                # Do not use with, as cmd is meant to stay open as long as monitor is running.
                cmd = subprocess.Popen(shlex_split(cmd_str), bufsize=-1, shell=False, stdin=subprocess.PIPE)
                com_gpu_list.print_plot_header(cmd.stdin)
            else:
                print('Fatal Error: gpu-plot not found.')
                # No plot process exists, so clear the flag; otherwise every
                # update would try to write to the stdin of a None cmd.
                GUT_CONST.plot = False

        # Start thread to update Monitor
        threading.Thread(target=refresh, daemon=True,
                         args=[GUT_CONST.sleep, update_data, com_gpu_list, devices, cmd, gmonitor]).start()
        try:
            Gtk.main()
        finally:
            # The main loop has ended, so no further updates are dispatched.
            if GUT_CONST.log:
                GUT_CONST.log_file_ptr.close()
    else:
        # Display text style Monitor
        try:
            while True:
                com_gpu_list.read_gpu_sensor_set(data_type=SensorSet.Monitor)
                os.system('clear')
                if GUT_CONST.debug:
                    print('{}DEBUG logger is active{}'.format((GUT_CONST.mark_up_codes['red'] +
                                                               GUT_CONST.mark_up_codes['bold']),
                                                              GUT_CONST.mark_up_codes['reset']))
                if GUT_CONST.log:
                    print('{}Logging to: {}{}'.format((GUT_CONST.mark_up_codes['red'] +
                                                       GUT_CONST.mark_up_codes['bold']),
                                                      GUT_CONST.log_file,
                                                      GUT_CONST.mark_up_codes['reset']))
                    com_gpu_list.print_log(GUT_CONST.log_file_ptr)
                com_gpu_list.print_table()
                sleep(GUT_CONST.sleep)
                # Set by ctrl_c_handler; exit status kept as in the original.
                if MonitorWindow.quit:
                    sys.exit(-1)
        except KeyboardInterrupt:
            sys.exit(0)
        finally:
            # Runs for every way out of the loop, including the quit-flag
            # exit, which previously left the log file open.
            if GUT_CONST.log:
                GUT_CONST.log_file_ptr.close()
# Script entry point: run the monitor only when executed directly.
if __name__ == '__main__':
    main()
|