File: az-helpers.sh

package info (click to toggle)
mpich 4.3.0%2Breally4.2.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, trixie
  • size: 419,120 kB
  • sloc: ansic: 1,215,557; cpp: 74,755; javascript: 40,763; f90: 20,649; sh: 18,463; xml: 14,418; python: 14,397; perl: 13,772; makefile: 9,279; fortran: 8,063; java: 4,553; asm: 324; ruby: 176; lisp: 19; php: 8; sed: 4
file content (203 lines) | stat: -rw-r--r-- 5,322 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#!/bin/bash -eE

# The following functions use Azure logging commands to report test
# details or errors. If the process is not running in an Azure environment,
# no special output is generated.

# Logging commands documentation: https://docs.microsoft.com/en-us/azure/devops/pipelines/scripts/logging-commands


# A non-empty AGENT_ID is treated as the marker of an Azure agent;
# every azure_* helper below keys off RUNNING_IN_AZURE.
if [ -n "$AGENT_ID" ]; then
    RUNNING_IN_AZURE="yes"
else
    RUNNING_IN_AZURE="no"
fi

# Report error and exit
#
# Arguments: $1 - error text
# Hands the text to azure_log_issue, prints it on stdout with an
# "ERROR: " prefix and terminates the shell with status 1.
error() {
    local text=$1
    azure_log_issue "${text}"
    echo "ERROR: ${text}"
    exit 1
}

# Define Azure pipeline variable
#
# Arguments: $1 - variable name
#            $2 - variable value
# Outputs:   a "task.setvariable" logging command on stdout; nothing when
#            RUNNING_IN_AZURE is "no"
# Note:      shell tracing (set -x) stays disabled after the call.
function azure_set_variable() {
    test "x$RUNNING_IN_AZURE" = "xno" && return
    # Keep the scratch variables local so the caller's own $name/$value
    # are not overwritten.
    local name=$1
    local value=$2
    # Do not remove 'set +x': https://developercommunity.visualstudio.com/t/pipeline-variable-incorrectly-inserts-single-quote/375679#T-N394968
    set +x
    echo "##vso[task.setvariable variable=${name}]${value}"
}

# Report an error to the Azure pipeline and mark the task as failed
#
# Arguments: $1 - error text
# Outputs:   a "task.logissue type=error" command followed by a
#            "task.complete result=Failed" command on stdout; nothing when
#            RUNNING_IN_AZURE is "no"
# Note:      the function itself does not exit or return an error, and it
#            leaves shell tracing (set -x) disabled.
function azure_log_issue() {
    test "x$RUNNING_IN_AZURE" = "xno" && return
    # Local, so the caller's own $msg is not overwritten
    local msg=$1
    set +x
    echo "##vso[task.logissue type=error]${msg}"
    echo "##vso[task.complete result=Failed;]"
}

# Report an error message to Azure pipeline
#
# Arguments: $1 - error text
# Outputs:   a "task.logissue type=error" command on stdout; nothing when
#            RUNNING_IN_AZURE is "no"
# Note:      leaves shell tracing (set -x) disabled.
function azure_log_error() {
    test "x$RUNNING_IN_AZURE" = "xno" && return
    # Local, so the caller's own $msg is not overwritten
    local msg=$1
    set +x
    echo "##vso[task.logissue type=error]${msg}"
}

# Report a warning message to Azure pipeline
#
# Arguments: $1 - warning text
# Outputs:   a "task.logissue type=warning" command on stdout; nothing when
#            RUNNING_IN_AZURE is "no"
# Note:      leaves shell tracing (set -x) disabled.
function azure_log_warning() {
    test "x$RUNNING_IN_AZURE" = "xno" && return
    # Local, so the caller's own $msg is not overwritten
    local msg=$1
    set +x
    echo "##vso[task.logissue type=warning]${msg}"
}

# Complete the task as "succeeded with issues"
#
# Arguments: $1 - text appended directly after the literal "DONE"
# Outputs:   a "task.complete result=SucceededWithIssues" command on stdout;
#            nothing when RUNNING_IN_AZURE is "no"
# Note:      leaves shell tracing (set -x) disabled.
function azure_complete_with_issues() {
    test "x$RUNNING_IN_AZURE" = "xno" && return
    # Local, so the caller's own $msg is not overwritten
    local msg=$1
    set +x
    echo "##vso[task.complete result=SucceededWithIssues;]DONE${msg}"
}

# Get IPv4 address of an interface
#
# Arguments: $1 - network interface name
# Outputs:   the address part (text before the '/') of every "inet " line
#            printed by 'ip addr show', one per line
function get_ip() {
    local iface=$1
    local addr
    # A single awk both selects the "inet " lines and strips the "/prefix"
    # suffix from the second field.
    addr=$(ip addr show "$iface" | awk '/inet / { sub(/\/.*/, "", $2); print $2 }')
    echo "$addr"
}

# Get active RDMA interfaces
#
# Arguments: $1 - (optional) InfiniBand class directory in sysfs;
#                 defaults to /sys/class/infiniband
# Outputs:   network interface names, sorted and de-duplicated, one per line
function get_rdma_interfaces() {
    local sysfs_root=${1:-/sys/class/infiniband}
    local ibdev port netif

    # Fields 1, 3 and 5 of each "Up" line are the IB device, the port and
    # the network interface (lines are assumed to look like
    # "<ibdev> port <N> ==> <netif> (Up)"); the remaining fields are ignored.
    ibdev2netdev | grep Up | while read -r ibdev _ port _ netif _
    do
        # skip devices that do not have proper gid (representors)
        [ -e "${sysfs_root}/${ibdev}/ports/${port}/gids/0" ] || continue

        echo "${netif}"
    done | sort -u
}

# Prepend each line with a timestamp
#
# Reads stdin line by line and writes "<UTC timestamp> <line>" to stdout.
# Note: leaves shell tracing (set -x) disabled.
function add_timestamp() {
    local line
    set +x
    # '|| [ -n "$line" ]' keeps a final line that lacks a trailing newline;
    # read returns non-zero for it but still fills $line.
    while IFS= read -r line || [ -n "$line" ]; do
        echo "$(date -u +"%Y-%m-%dT%T.%NZ") $line"
    done
}

# Prepare the environment-modules setup for this shell: source the modules
# profile script and put /hpc/local/etc/modulefiles at the front of the
# exported MODULEPATH.
az_init_modules() {
    source /etc/profile.d/modules.sh
    MODULEPATH="/hpc/local/etc/modulefiles:$MODULEPATH"
    export MODULEPATH
    # Read module files (W/A if there're some network instabilities lead to autofs issues)
    find /hpc/local/etc/modulefiles > /dev/null || :
}

#
# Test if an environment module exists and load it if yes.
# Checks availability up to 5 times, poking /hpc/local between attempts
# in case of automount failure.
# Otherwise, return error code.
#
# Arguments: $1 - exact module name as listed by 'module avail -t'
# Returns:   0 once the module was found and 'module load' was run,
#            1 (after logging a warning) if it never showed up
#
function az_module_load() {
    local module=$1
    local retries=5

    # -x -F: compare the whole line literally, so that the '.' in names such
    # as "dev/cuda11.4" is not interpreted as a regex wildcard.
    until module avail -t 2>&1 | grep -qxF -- "$module"; do
        if [ "$retries" -gt 1 ]; then
            # Attempt to refresh automount
            echo "Module $module not found, retrying..."
            # Best effort only: a failing ls must not abort a 'set -e' caller
            ls /hpc/local > /dev/null 2>&1 || true
            sleep 1
        else
            # Give up trying
            echo "MODULEPATH='${MODULEPATH}'"
            module avail || true
            azure_log_warning "Module $module cannot be loaded"
            return 1
        fi
        retries=$((retries - 1))
    done
    module load "$module"
    return 0
}

#
# Safe unload for env modules (even if it doesn't exist)
#
# Arguments: $1 - module name
# Returns:   always 0; a failing 'module unload' is deliberately ignored
#
function az_module_unload() {
    # Local, so the caller's own $module is not overwritten
    local module=$1
    module unload "${module}" || true
}


#
# Try to load the CUDA modules if the NVIDIA driver is installed.
# Results are left in the globals num_gpus, have_cuda and have_gdrcopy;
# without the driver they stay at 0 / no / no.
#
try_load_cuda_env() {
    num_gpus=0
    have_cuda=no
    have_gdrcopy=no

    # No NVIDIA driver on this host: keep the defaults set above
    [ -f "/proc/driver/nvidia/version" ] || return 0

    if az_module_load dev/cuda11.4; then
        have_cuda=yes
    fi
    if az_module_load dev/gdrcopy2.3_cuda11.4; then
        have_gdrcopy=yes
    fi

    # Dump GPU state into the log
    nvidia-smi -a
    ls -l /dev/nvidia*
    num_gpus=$(nvidia-smi -L | wc -l)

    # GPUs are present but the nv_mem version file is missing
    if [[ "$num_gpus" -gt 0 && ! -f /sys/kernel/mm/memory_peers/nv_mem/version ]]; then
        lsmod
        azure_log_error "GPU direct driver not loaded"
    fi
}


# Decide whether the release pipeline should be launched and publish the
# decision as the Azure output variable "Launch" (True/False).
#
# Arguments: $1 - build reason
#            $2 - source version (commit) of the build
#            $3 - commit title prefix that requests a launch in pull requests
# Outputs:   a "task.setvariable variable=Launch;isOutput=true" command
# Note:      leaves shell tracing (set -x) disabled.
check_release_build() {
    local build_reason=$1
    local build_sourceversion=$2
    local title_mask=$3
    # Default to False so an unlisted build reason never emits an empty or
    # stale value.
    local launch=False
    local range title

    case "${build_reason}" in
        IndividualCI|ResourceTrigger)
            launch=True
            ;;
        PullRequest)
            # In case of pull request, HEAD^ is the branch commit we merge with
            range="$(git rev-parse HEAD^)..${build_sourceversion}"
            # One git call lists every commit title in the range; launch as
            # soon as one of them starts with the requested prefix.
            while IFS= read -r title; do
                if [[ "$title" == "${title_mask}"* ]]; then
                    launch=True
                    break
                fi
            done < <(git log --format="%s" "$range")
            ;;
    esac
    set +x
    echo "##vso[task.setvariable variable=Launch;isOutput=true]${launch}"
}


#
# Return arch in the same format as Java System.getProperty("os.arch")
#
# Outputs: "amd64" when 'uname -m' reports x86_64, otherwise the
#          'uname -m' value unchanged
#
get_arch() {
    # Local, so the caller's own $arch is not overwritten
    local machine
    machine=$(uname -m)
    case "$machine" in
        x86_64) echo "amd64" ;;
        *)      echo "$machine" ;;
    esac
}