File: check_qps.sh

package info (click to toggle)
mpich 4.0.2-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 423,384 kB
  • sloc: ansic: 1,088,434; cpp: 71,364; javascript: 40,763; f90: 22,829; sh: 17,463; perl: 14,773; xml: 14,418; python: 10,265; makefile: 9,246; fortran: 8,008; java: 4,355; asm: 324; ruby: 176; lisp: 19; php: 8; sed: 4
file content (104 lines) | stat: -rwxr-xr-x 2,506 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/bin/bash -eE
#
# Copyright (C) Mellanox Technologies Ltd. 2021.  ALL RIGHTS RESERVED.
#
# See file LICENSE for terms.
#

#
# This script detects potentially stuck Queue Pairs which have ci != pi.
#

# Default location of the UCX VFS mount point (overridden with -p)
VFS_UCX_PATH=/tmp/ucx

# Names of the per-QP files looked up inside each QP directory
QP_NUM_FILE=qp_num
HW_CI_FILE=hw_ci
PREV_SW_PI_FILE=prev_sw_pi

# Seconds to wait between the two QP traversals (overridden with -i)
QP_CHECK_INTERVAL=10
# 1 - print all QPs, 0 - print only stuck QPs (set to 1 with -a)
PRINT_ALL_QPS=0

# Print the command-line help and terminate the script.
#
# Outputs: the usage text on stdout
# Exits:   always, with status 1
usage()
{
    echo " Usage:"
    # $0 is kept inside the quotes so an invocation path containing spaces
    # is printed verbatim instead of being word-split
    echo "  $0 [ options ]"
    echo " Options:"
    echo "  -a           Print all QPs"
    echo "  -p <path>    Path to UCX VFS mount point (/tmp/ucx)"
    echo "  -i <seconds> Interval to check QP state"
    exit 1
}

# Parse the command-line options. An unknown option, or an option missing
# its argument, falls through to the default arm and shows the help.
while getopts ":ap:i:" opt
do
    case "${opt}" in
        p) VFS_UCX_PATH="${OPTARG}" ;;
        i) QP_CHECK_INTERVAL="${OPTARG}" ;;
        a) PRINT_ALL_QPS=1 ;;
        *) usage ;;
    esac
done
# Drop the options that were consumed, leaving only positional arguments
shift "$((OPTIND - 1))"

# Per-QP state, keyed by QP number:
#   qp_nums        - directory in which the QP was found
#   initial_hw_cis - hw_ci value read when the QP was first recorded
declare -A qp_nums initial_hw_cis

# Scan the UCX VFS tree and record every QP that has not been seen before.
#
# Two kinds of directories under VFS_UCX_PATH are inspected:
#   */uct/worker/*/iface/*/dci_pool/*/*   (DC TX work queues)
#   */uct/worker/*/iface/*/ep/*           (RC endpoints)
# A directory counts as a QP when it holds QP_NUM_FILE, HW_CI_FILE and
# PREV_SW_PI_FILE. For a QP number that is not recorded yet, its directory is
# stored in qp_nums[] and its current hw_ci in initial_hw_cis[]; QPs already
# recorded are left untouched, so repeated calls keep the first hw_ci sample.
#
# Globals: VFS_UCX_PATH, QP_NUM_FILE, HW_CI_FILE, PREV_SW_PI_FILE (read);
#          qp_nums, initial_hw_cis (written)
traverse() {
    local file dir qp_num hw_ci

    # Only the variable parts are quoted: the '*' components must still glob,
    # while a mount point containing spaces must not be word-split.
    for file in \
        "${VFS_UCX_PATH}"/*/uct/worker/*/iface/*/dci_pool/*/*/"${QP_NUM_FILE}" \
        "${VFS_UCX_PATH}"/*/uct/worker/*/iface/*/ep/*/"${QP_NUM_FILE}"
    do
        # A glob without matches is left as the literal pattern; skip it
        [[ -f "${file}" ]] || continue

        dir=${file%/*}
        [[ -f "${dir}/${HW_CI_FILE}" && -f "${dir}/${PREV_SW_PI_FILE}" ]] || continue

        # A QP can disappear between the checks above and the read; skip it
        # instead of letting the failed read abort the script under "-e"
        qp_num=$(<"${file}") || continue
        # An empty key is not a valid associative-array subscript
        [[ -n "${qp_num}" ]] || continue

        if [[ -z "${qp_nums[${qp_num}]:-}" ]] ; then
            hw_ci=$(<"${dir}/${HW_CI_FILE}") || continue
            qp_nums[${qp_num}]=${dir}
            initial_hw_cis[${qp_num}]=${hw_ci}
        fi
    done
}

# Report the state of every QP recorded in qp_nums[].
#
# A QP is reported as "stuck" when its hw_ci differs from prev_sw_pi AND
# hw_ci still equals the value stored in initial_hw_cis[]; otherwise it is
# "ok". "ok" QPs are printed only when PRINT_ALL_QPS is non-zero. QPs whose
# directory no longer exists, or whose counters cannot be read, are skipped.
#
# Globals: qp_nums, initial_hw_cis, HW_CI_FILE, PREV_SW_PI_FILE,
#          PRINT_ALL_QPS (read)
# Outputs: one line per reported QP on stdout
print_qp_num_info() {
    local qp_num dir hw_ci prev_sw_pi initial_hw_ci result_str

    for qp_num in "${!qp_nums[@]}"
    do
        dir=${qp_nums[${qp_num}]}
        # The QP may have been destroyed since it was recorded
        [[ -d "${dir}" ]] || continue

        # The directory can still vanish between the check and the reads;
        # skip the QP rather than abort the whole report under "-e"
        hw_ci=$(<"${dir}/${HW_CI_FILE}") || continue
        prev_sw_pi=$(<"${dir}/${PREV_SW_PI_FILE}") || continue
        initial_hw_ci=${initial_hw_cis[${qp_num}]}

        # An incomplete sample cannot be classified (and would make the
        # numeric tests below fail), so it is not reported at all
        [[ -n "${hw_ci}" && -n "${prev_sw_pi}" && -n "${initial_hw_ci}" ]] || continue

        # QP is considered as stuck if (hw_ci != sw_pi) AND hw_ci hasn't been changed
        if [ "${hw_ci}" -eq "${prev_sw_pi}" ] || [ "${initial_hw_ci}" -ne "${hw_ci}" ] ; then
            result_str="ok"
            if [ "${PRINT_ALL_QPS}" -eq 0 ] ; then
                continue
            fi
        else
            result_str="stuck (path=${dir})"
        fi

        printf '%s\n' "qp=0x${qp_num}: pi=${prev_sw_pi} ci=${hw_ci} initial_ci=${initial_hw_ci} - ${result_str}"
    done
}

# Record the QPs, wait for the check interval, run a second pass (which only
# adds QPs that were not recorded by the first one), then print the report.
main() {
    traverse
    sleep ${QP_CHECK_INTERVAL}
    traverse

    print_qp_num_info
}

main