File: ext_cgroups-check_cgroups_extensions.py

package info (click to toggle)
waagent 2.15.0.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 9,820 kB
  • sloc: python: 60,164; xml: 4,126; sh: 1,354; makefile: 22
file content (232 lines) | stat: -rwxr-xr-x 11,537 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/usr/bin/env pypy3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import re

from assertpy import fail

from tests_e2e.tests.lib.agent_log import AgentLog
from tests_e2e.tests.lib.cgroup_helpers import verify_if_distro_supports_cgroup, \
    verify_agent_cgroup_assigned_correctly, BASE_CGROUP, get_unit_cgroup_mount_path, \
    GATESTEXT_SERVICE, AZUREMONITORAGENT_SERVICE, check_agent_quota_disabled, \
    check_cgroup_disabled_due_to_systemd_error, CGROUP_TRACKED_PATTERN, AZUREMONITOREXT_FULL_NAME, GATESTEXT_FULL_NAME, \
    print_cgroups, get_mounted_controller_list, using_cgroupv2
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.retry import retry_if_false


def verify_custom_script_cgroup_assigned_correctly():
    """
    Checks that the custom script extension (CSE) left its cgroup record in the expected folder and that the record
    shows the script running under the CustomScript extension slice for both CPU and Memory.
    """
    log.info("===== Verifying custom script was assigned to the correct cgroups")

    # The CSE script saves the cgroups it was executed under into this folder at run time. The CSE process exits
    # after execution and the system cleans up its cgroup paths, so the saved record is what gets inspected here.
    check_temporary_folder_exists()

    cpu_found = False
    memory_found = False

    log.info("custom script cgroup mounts:")

    with open('/var/lib/waagent/tmp/custom_script_check') as record:
        recorded_mounts = record.read()
        log.info("%s", recorded_mounts)

        slice_path = "/azure.slice/azure-vmextensions.slice/azure-vmextensions-Microsoft.Azure.Extensions.CustomScript"

        # Under cgroup v1 the cpu entry is accepted with its two controller names in either order.
        expected_cpu_v1 = ("cpu,cpuacct:{0}".format(slice_path), "cpuacct,cpu:{0}".format(slice_path))
        expected_memory_v1 = "memory:{0}".format(slice_path)
        # Under cgroup v2 a single matching entry counts for both CPU and Memory.
        expected_unified_v2 = "0::{0}".format(slice_path)

        is_v2 = using_cgroupv2()

        for entry in recorded_mounts.split("\n"):
            if is_v2:
                if expected_unified_v2 not in entry:
                    continue
                log.info('Custom script extension mounted under correct cgroup for CPU and Memory: %s', entry)
                cpu_found = True
                memory_found = True
            elif any(expected in entry for expected in expected_cpu_v1):
                log.info('Custom script extension mounted under correct cgroup for CPU: %s', entry)
                cpu_found = True
            elif expected_memory_v1 in entry:
                log.info('Custom script extension mounted under correct cgroup for Memory: %s', entry)
                memory_found = True

        if not cpu_found:
            cpu_failure = 'Custom script not mounted correctly for CPU! Expected {0} or {1} in cgroupv1 or {2} in cgroupv2'
            fail(cpu_failure.format(expected_cpu_v1[0], expected_cpu_v1[1], expected_unified_v2))

        if not memory_found:
            memory_failure = 'Custom script not mounted correctly for Memory! Expected {0} in cgroupv1 or {1} in cgroupv2'
            fail(memory_failure.format(expected_memory_v1, expected_unified_v2))


def check_temporary_folder_exists():
    """
    Fails the test when /var/lib/waagent/tmp does not exist; a missing folder is reported as the CSE script not
    having run.
    """
    tmp_folder = "/var/lib/waagent/tmp"
    if os.path.exists(tmp_folder):
        return
    fail("Temporary folder {0} was not created which means CSE script did not run!".format(tmp_folder))


def verify_ext_cgroup_controllers_created_on_file_system():
    """
    Ensures that, after extension install, BASE_CGROUP exists and contains a path for every controller returned by
    get_mounted_controller_list(). Fails the test listing the missing and the verified paths otherwise.
    """
    log.info("===== Verifying ext cgroup controllers exist on file system")

    base_exists = os.path.exists(BASE_CGROUP)
    found_paths = []
    absent_paths = []

    for controller in get_mounted_controller_list():
        candidate = os.path.join(BASE_CGROUP, controller)
        if os.path.exists(candidate):
            found_paths.append(candidate)
        else:
            absent_paths.append(candidate)

    # The check fails when the base path itself is missing or when any single controller path is missing.
    if absent_paths or not base_exists:
        failure = ('Expected all of the extension controller: {0} paths present in the file system after extension install. But missing cgroups paths are :{1}\n'
                   'and verified cgroup paths are: {2} \nSystem mounted cgroups are \n{3}')
        fail(failure.format(get_mounted_controller_list(), absent_paths, found_paths, print_cgroups()))

    log.info('Verified all extension cgroup controller paths are present and they are: \n {0}'.format(found_paths))


def verify_extension_service_cgroup_created_on_file_system():
    """
    Ensures that the cgroup paths of the extension services exist on the file system after running the extensions.
    """
    log.info("===== Verifying the extension service cgroup paths exist on file system")

    # GA Test Extension Service first, then Azure Monitor Extension Service
    for service in (GATESTEXT_SERVICE, AZUREMONITORAGENT_SERVICE):
        verify_extension_service_cgroup_created(service, get_unit_cgroup_mount_path(service))

    log.info('Verified all extension service cgroup paths created in file system .\n')


def verify_extension_service_cgroup_created(service_name, cgroup_mount_path):
    """
    Fails the test unless the given service has a cgroup path under every controller returned by
    get_mounted_controller_list().

    service_name is only used in the failure message; cgroup_mount_path is the service's cgroup path with a leading
    separator, e.g. /azure.slice/walinuxagent.service
    """
    log.info("expected extension service cgroup mount path: %s", cgroup_mount_path)

    found_paths = []
    absent_paths = []

    for controller in get_mounted_controller_list():
        # Dropping the first character of cgroup_mount_path (azure.slice/walinuxagent.service) lets os.path.join
        # append it to the controller path, giving something like /sys/fs/cgroup/cpu/azure.slice/walinuxagent.service
        candidate = os.path.join(BASE_CGROUP, controller, cgroup_mount_path[1:])

        if os.path.exists(candidate):
            found_paths.append(candidate)
        else:
            absent_paths.append(candidate)

    if absent_paths:
        failure = ("Extension service: [{0}] cgroup paths couldn't be found on file system. Missing cgroup paths are: {1} \n Verified cgroup paths are: {2} \n "
                   "System mounted cgroups are \n{3}")
        fail(failure.format(service_name, absent_paths, found_paths, print_cgroups()))


def verify_ext_cgroups_tracked():
    """
    Checks if ext cgroups are tracked by the agent. This is verified by matching every agent log record against
    CGROUP_TRACKED_PATTERN and checking that the tracked names cover the GA test extension, the Azure Monitor
    extension and the service of each of them.

    Fails the test (via assertpy's fail) when no tracking record is found at all, or when any of the four expected
    cgroups is missing; the checks run in that order and the first failing one is reported.
    """
    log.info("===== Verifying ext cgroups tracked")

    cgroup_tracked_pattern_re = re.compile(CGROUP_TRACKED_PATTERN)
    # Take the pattern text from the object compiled here: that attribute exists whether CGROUP_TRACKED_PATTERN is
    # a plain string or an already compiled pattern, so building a failure message can never raise AttributeError.
    pattern_text = cgroup_tracked_pattern_re.pattern

    # (name prefix, label used in the failure message). The order is significant: a tracked name is attributed to
    # the first prefix it starts with, and missing cgroups are reported in this order.
    expected_cgroups = (
        (GATESTEXT_FULL_NAME, 'gatestext'),
        (AZUREMONITOREXT_FULL_NAME, 'azuremonitoragent'),
        (GATESTEXT_SERVICE, 'gatestext service'),
        (AZUREMONITORAGENT_SERVICE, 'azuremonitoragent service'),
    )
    tracked_labels = set()
    cgroups_added_for_telemetry = []

    for record in AgentLog().read():
        # Example of a tracking record, as given by the original comment (confirm against CGROUP_TRACKED_PATTERN):
        # 2021-11-14T13:09:59.351961Z INFO ExtHandler ExtHandler Started cpu tracking cgroup Microsoft.Azure.Extensions.Edp.GATestExtGo-1.0.0.2
        # [/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/azure-vmextensions-Microsoft.Azure.Extensions.Edp.GATestExtGo_1.0.0.2.slice]
        cgroup_tracked_match = cgroup_tracked_pattern_re.findall(record.message)
        if not cgroup_tracked_match:
            continue

        # Only the first match of a record is used; its second and third groups are the cgroup name and path.
        name, path = cgroup_tracked_match[0][1], cgroup_tracked_match[0][2]
        for prefix, label in expected_cgroups:
            if name.startswith(prefix):
                tracked_labels.add(label)
                break
        cgroups_added_for_telemetry.append((name, path))

    if not cgroups_added_for_telemetry:
        fail('Expected cgroups were not tracked, according to the agent log. '
             'Pattern searched for: {0} and found \n{1}'.format(pattern_text, cgroups_added_for_telemetry))

    for _, label in expected_cgroups:
        if label not in tracked_labels:
            fail('Expected {0} cgroups were not tracked, according to the agent log. '
                 'Pattern searched for: {1} and found \n{2}'.format(label, pattern_text, cgroups_added_for_telemetry))

    log.info("Extension cgroups tracked as expected\n%s", cgroups_added_for_telemetry)


def main():
    """
    Runs the extension cgroup verifications one after another, in a fixed order. Nothing is caught here, so the
    first verification that raises ends the run.
    """
    verifications = (
        verify_if_distro_supports_cgroup,
        verify_ext_cgroup_controllers_created_on_file_system,
        verify_custom_script_cgroup_assigned_correctly,
        verify_agent_cgroup_assigned_correctly,
        verify_extension_service_cgroup_created_on_file_system,
        verify_ext_cgroups_tracked,
    )
    for verification in verifications:
        verification()


try:
    main()
except Exception:
    # The agent can disable cgroups and reset the quotas when an extension failed to start using systemd-run.
    # Only in that case are the validation failures ignored; any other failure is re-raised unchanged.
    if not (check_cgroup_disabled_due_to_systemd_error() and retry_if_false(check_agent_quota_disabled)):
        raise
    log.info("Cgroup is disabled due to systemd error while invoking the extension, ignoring ext cgroups validations")