File: simgrid-monkey

package info (click to toggle)
simgrid 4.0-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 38,980 kB
  • sloc: cpp: 123,583; ansic: 66,779; python: 8,358; java: 6,406; fortran: 6,079; f90: 5,123; xml: 4,587; sh: 2,337; perl: 1,436; makefile: 105; lisp: 49; javascript: 7; sed: 6
file content (143 lines) | stat: -rwxr-xr-x 6,517 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#! /usr/bin/env python3

# Copyright (c) 2022-2025. The SimGrid Team. All rights reserved.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the license (GNU LGPL) which comes with this package.

# The goal is to introduce random failures in a simulation, to test SimGrid under extreme conditions.
# 
# It is made of several components.
# 
# * a plugin: cmonkey. Can be used from the command line as follows:
#   * --cfg=plugin:cmonkey --cfg=cmonkey/tell:1
#     Get information about the resource count and the timestamps of each scheduling round.
#   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/host:1
#     Kill the host #1 after 42 seconds (using a kernel::Timer so that no actor gets involved in the killing)
#   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/link:0
#     Kill the link #0 after 42 seconds (using a kernel::Timer)
#   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/pid:0
#     Kill the actor of PID 0 after 42 seconds (using a kernel::Timer). Make sure that your actors are autorestarted.
# 
# * a python script: tools/simgrid-monkey (this file)
#   * It takes a regular SimGrid simulation as a parameter, uses the cmonkey plugin to get the information about it,
#     and then reruns it many times, with one resource being turn_off() + turn_on() in each run.
#   * Each resource gets killed between consecutive timestamps, and on each timestamp.
#   * So the amount of simulations is: 1 + (host_c+link_c+actor_c) * timestamps * 2
# 
# * Test programs, written to resist these extreme conditions:
#   * teshsuite/s4u/monkey-masterworkers: tests synchronous comms and execs (C++ and python)
#   * teshsuite/s4u/monkey-semaphore: tests async semaphores (C++ only)

import sys
import os
import argparse
import subprocess
import copy
import re


def get_info(cmd):
    """Run the command once with the cmonkey plugin in "tell" mode and parse its report.

    The command is executed with extra ``--cfg`` / ``--log`` flags appended (the
    caller's list is left untouched), with stderr merged into stdout. The output
    is then scanned for lines starting with ``ACTOR_COUNT=``, ``HOST_COUNT=``,
    ``LINK_COUNT=`` and ``TIMESTAMP=``.

    Args:
        cmd: the simulation command, as a list of strings suitable for
            ``subprocess.run``.

    Returns:
        A tuple ``(actor_count, host_count, link_count, timestamps)``. Counts
        default to 0 when the corresponding line is absent (the last occurrence
        wins if a line is repeated); ``timestamps`` is the list of floats in
        the order they were reported, duplicates included.

    Raises:
        RuntimeError: if the command exits with a non-zero return code. The
            message contains the return code, the full command and its output.
    """
    cmd_tell = [
        *cmd,
        "--cfg=plugin:cmonkey",
        "--cfg=cmonkey/tell:1",
        "--log=root.t:critical",
        "--log=cmonkey.t:info",
        "--log=cmonkey.fmt:%m%n",
    ]
    print("Get the initial info from the command.")
    first_run = subprocess.run(cmd_tell, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    if first_run.returncode != 0:
        msg = f"Peek run of the command raised an error (retcode: {first_run.returncode})\n"
        msg += f"Full command was {' '.join(cmd_tell)}\n"
        if first_run.stdout is not None:
            msg += str(first_run.stdout, errors='replace')
        raise RuntimeError(msg)

    # One pattern for all the report lines, so each output line is matched only once.
    report_line = re.compile(r"^(ACTOR_COUNT|HOST_COUNT|LINK_COUNT|TIMESTAMP)=(.*)")
    counts = {"ACTOR_COUNT": 0, "HOST_COUNT": 0, "LINK_COUNT": 0}
    timestamps = []
    for line in str(first_run.stdout, errors='replace').split("\n"):
        found = report_line.match(line)
        if found is None:
            continue
        key, value = found.groups()
        if key == "TIMESTAMP":
            timestamps.append(float(value))
        else:
            counts[key] = int(value)

    return (counts["ACTOR_COUNT"], counts["HOST_COUNT"], counts["LINK_COUNT"], timestamps)

# Command-line interface: everything positional is the simulation command to torture.
# NOTE(review): the value of --valgrind is parsed but not used anywhere in the code visible here.
parser = argparse.ArgumentParser(description='Run a SimGrid simulation, and turn off/on resources at random.')
parser.add_argument('--valgrind', help="Run the simulations in valgrind")
parser.add_argument('command', nargs='*')
args = parser.parse_args()
if not args.command:
    # Without a command, the peek run would try to execute the first --cfg flag as a program.
    parser.error("no simulation command given")

# Peek run: learn how many resources exist and when the scheduling rounds occur.
actor_count, host_count, link_count, timestamps = get_info(args.command)
timestamps = sorted(set(timestamps))  # kill duplicates
print(f"Monkey informations: actors:{actor_count} hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")

# Campaign-wide counters, updated by do_run().
error_count = 0  # number of valgrind findings (leaks or error summaries) over all runs
test_count = 0   # number of runs started so far

# Valgrind report lines of interest, compiled once instead of on every output line.
_LEAK_LINE = re.compile(r"==.*    in use at exit: (.*)")
_NO_LEAK_LINE = re.compile(r"==.* in use at exit: 0 bytes in 0 blocks")
_ERROR_SUMMARY_LINE = re.compile(r"==.* ERROR SUMMARY: (.*)")
_NO_ERROR_LINE = re.compile(r"==.* 0 errors from 0 contexts")


def do_run(cmd, extra_params, test_todo):
    """Run one simulation with the cmonkey plugin enabled and inspect its output.

    The command is run with ``--cfg=plugin:cmonkey`` followed by ``extra_params``
    appended (the caller's list is left untouched), with stderr merged into
    stdout. The output is then scanned for valgrind "in use at exit" and
    "ERROR SUMMARY" lines; each one that reports a leak or a non-zero error
    count increments the global ``error_count``.

    Args:
        cmd: the simulation command, as a list of strings.
        extra_params: additional command-line flags selecting what to kill and when.
        test_todo: total number of planned runs, only used in the progress message.

    Side effects:
        Increments the globals ``test_count`` and possibly ``error_count``,
        prints progress to stdout, and terminates the whole script with
        ``sys.exit(1)`` if the command returns a non-zero code.
    """
    global test_count, error_count
    test_count += 1
    full_cmd = [*cmd, "--cfg=plugin:cmonkey", *extra_params]
    print(f"Start {' '.join(full_cmd)}")
    sys.stdout.flush()

    run = subprocess.run(full_cmd, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    out = str(run.stdout, errors='replace')
    if run.returncode != 0:
        print(f"ERROR (retcode: {run.returncode}). Output:\n" + out)
        sys.exit(1)

    for line in out.split("\n"):
        leak = _LEAK_LINE.match(line)
        if leak and not _NO_LEAK_LINE.match(line):
            print(f"LEAK SUMMARY: {leak.group(1)} in use at exit")
            error_count += 1

        summary = _ERROR_SUMMARY_LINE.match(line)
        if summary:
            print(f"valgrind summary: {summary.group(1)}")
            if not _NO_ERROR_LINE.match(line):
                error_count += 1
    # The run itself returned 0; valgrind findings are only tallied in error_count.
    print(f"Test {test_count} out of {test_todo} succeeded.\n")
        

def doit():
    """Run the whole campaign: one simulation per (kill time, resource) pair.

    For every timestamp of the peek run, each actor, host and link is killed
    once halfway between the previous timestamp and this one (the first
    interval starts at time 0), and once exactly on this timestamp. Within
    each kill time, actors are processed first, then hosts, then links.

    Reads the module-level ``timestamps``, ``actor_count``, ``host_count``,
    ``link_count`` and ``args.command``, and delegates each run to ``do_run``.
    """
    resources = (("pid", actor_count), ("host", host_count), ("link", link_count))
    test_todo = 2 * len(timestamps) * (actor_count + host_count + link_count)
    prev_time = 0
    for now in timestamps:
        # True midpoint of the interval ending at this timestamp.
        midpoint = (prev_time + now) / 2
        for kill_time in (midpoint, now):
            for kind, count in resources:
                for index in range(count):
                    do_run(args.command, [f"--cfg=cmonkey/time:{kill_time}", f"--cfg=cmonkey/{kind}:{index}"], test_todo)
        # Advance the interval start, so that the next midpoint falls between consecutive timestamps.
        prev_time = now
# Run the whole campaign, then report how many valgrind findings were tallied.
doit()

print(f"In total, the monkey found {error_count} errors.")
# Exit statuses are truncated to 8 bits on POSIX systems: clamp the count so that
# a multiple of 256 errors cannot be reported as a success (status 0).
sys.exit(min(error_count, 255))