File: simgrid-monkey

package info (click to toggle)
simgrid 4.0-1
links: PTS, VCS
area: main
in suites: trixie
size: 38,980 kB
sloc: cpp: 123,583; ansic: 66,779; python: 8,358; java: 6,406; fortran: 6,079; f90: 5,123; xml: 4,587; sh: 2,337; perl: 1,436; makefile: 105; lisp: 49; javascript: 7; sed: 6
file content (143 lines) | stat: -rwxr-xr-x 6,517 bytes
parent folder | download | duplicates (2)
#! /usr/bin/env python3

# Copyright (c) 2022-2025. The SimGrid Team. All rights reserved.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the license (GNU LGPL) which comes with this package.

# The goal is to introduce random failures in a simulation, to test SimGrid under extreme conditions.
# 
# It is made of several components.
# 
# * a plugin: cmonkey. Can be used from the command line as follows:
#   * --cfg=plugin:cmonkey --cfg=cmonkey/tell:1
#     Get information about the resource count and the timestamps of each scheduling round.
#   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/host:1
#     Kill the host #1 after 42 seconds (using a kernel::Timer so that no actor gets involved in the killing)
#   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/link:0
#     Kill the link #0 after 42 seconds (using a kernel::Timer)
#   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/pid:0
#     Kill the actor of PID 0 after 42 seconds (using a kernel::Timer). Make sure that your actors are autorestarted.
# 
# * a python script: tools/simgrid-monkey (this file)
#   * It takes a regular SimGrid simulation as a parameter, uses the cmonkey plugin to get information about it,
#     and then launches many runs, with one resource being turned off and back on (turn_off() + turn_on()) in each run.
#   * Each resource gets killed between consecutive timestamps, and on each timestamp.
#   * So the number of simulations is: 1 + (host_c+link_c+actor_c) * timestamps * 2
# 
# * Test programs, written to resist these extreme conditions:
#   * teshsuite/s4u/monkey-masterworkers: tests synchronous comms and execs (C++ and python)
#   * teshsuite/s4u/monkey-semaphore: tests async semaphores (C++ only)

import sys
import os
import argparse
import subprocess
import copy
import re


def get_info(cmd):
    """Run ``cmd`` once with the cmonkey plugin in "tell" mode and parse its report.

    The given command (a list of arguments) is left untouched: a copy is
    extended with the cmonkey/logging options and executed in the current
    working directory, with stderr merged into stdout.

    Returns a tuple ``(actor_count, host_count, link_count, timestamps)`` built
    from the ``ACTOR_COUNT=``, ``HOST_COUNT=``, ``LINK_COUNT=`` and
    ``TIMESTAMP=`` lines of the output. Counts default to 0 when the matching
    line is absent; when a count is printed several times the last value wins.
    ``timestamps`` is the list of floats in their order of appearance.

    Raises ``Exception`` (carrying the full command line and the captured
    output) when the command exits with a non-zero status.
    """
    probe_cmd = copy.deepcopy(cmd)
    probe_cmd.extend([
        "--cfg=plugin:cmonkey",
        "--cfg=cmonkey/tell:1",
        "--log=root.t:critical",
        "--log=cmonkey.t:info",
        "--log=cmonkey.fmt:%m%n",
    ])
    print("Get the initial info from the command.")
    probe = subprocess.run(probe_cmd, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    if probe.returncode != 0:
        report = [
            f"Peek run of the command raised an error (retcode: {probe.returncode})\n",
            f"Full command was {' '.join(probe_cmd)}\n",
        ]
        if probe.stdout is not None:
            report.append(str(probe.stdout, errors='replace'))
        raise Exception("".join(report))

    # One anchored pattern per integer counter announced by the plugin.
    count_patterns = {
        "actor": "^ACTOR_COUNT=(.*)",
        "host": "^HOST_COUNT=(.*)",
        "link": "^LINK_COUNT=(.*)",
    }
    counts = dict.fromkeys(count_patterns, 0)
    timestamps = []
    for line in str(probe.stdout, errors='replace').split("\n"):
        for key, pattern in count_patterns.items():
            found = re.match(pattern, line)
            if found:
                counts[key] = int(found.group(1))
        found = re.match("^TIMESTAMP=(.*)", line)
        if found:
            timestamps.append(float(found.group(1)))

    return (counts["actor"], counts["host"], counts["link"], timestamps)

# Command line: every positional argument forms the simulation command to run.
# NOTE(review): the value of --valgrind is parsed but never read in this script.
parser = argparse.ArgumentParser(description='Run a SimGrid simulation, and turn off/on resources at random.')
parser.add_argument('--valgrind', help="Run the simulations in valgrind")
parser.add_argument('command', nargs='*')
args = parser.parse_args()

# Peek run: learn how many resources exist and when the scheduling rounds occur.
actor_count, host_count, link_count, timestamps = get_info(args.command)
timestamps = sorted(set(timestamps))  # kill duplicates
print(f"Monkey informations: actors:{actor_count} hosts:{host_count} links:{link_count} timestamps:{' '.join(map(str, timestamps))}")

# Global counters updated by do_run(): runs started so far, and problems spotted in their output.
test_count = 0
error_count = 0
def do_run(cmd, extra_params, test_todo):
    """Run one simulation with the cmonkey plugin enabled and inspect its output.

    ``cmd`` is the base command (list of arguments, not modified),
    ``extra_params`` the additional arguments to append after
    ``--cfg=plugin:cmonkey``, and ``test_todo`` the total number of planned
    runs, only used in the progress message.

    Increments the global ``test_count`` for every call. When the command exits
    with a non-zero status, its output is printed and the whole script exits
    with status 1. Otherwise the output is scanned for valgrind-style report
    lines: a non-empty "in use at exit" line or an "ERROR SUMMARY" other than
    "0 errors from 0 contexts" each increment the global ``error_count``.
    """
    global test_count, error_count
    test_count += 1

    full_cmd = copy.deepcopy(cmd)
    full_cmd.append("--cfg=plugin:cmonkey")
    full_cmd.extend(extra_params)
    print(f"Start {' '.join(full_cmd)}")
    sys.stdout.flush()  # make the announcement visible before the run starts

    run = subprocess.run(full_cmd, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    out = str(run.stdout, errors='replace')

    if run.returncode != 0:
        print(f"ERROR (retcode: {run.returncode}). Output:\n" + out)
        sys.exit(1)

    for line in out.split("\n"):
        leak = re.match("==.*    in use at exit: (.*)", line)
        if leak is not None and re.match("==.* in use at exit: 0 bytes in 0 blocks", line) is None:
            print(f"LEAK SUMMARY: {leak.group(1)} in use at exit")
            error_count += 1

        summary = re.match("==.* ERROR SUMMARY: (.*)", line)
        if summary is None:
            continue
        print(f"valgrind summary: {summary.group(1)}")
        if re.match("==.* 0 errors from 0 contexts", line) is None:
            error_count += 1

    print (f"Test {test_count} out of {test_todo} succeded.\n")
        

def doit():
    """Run the whole campaign of simulations, killing one resource per run.

    For each timestamp of the module-level ``timestamps`` list, every actor,
    host and link is killed in two separate runs: first halfway between the
    previous timestamp (0 before the first one) and the current timestamp,
    then exactly on the current timestamp. That is
    ``2 * len(timestamps) * (actor_count + host_count + link_count)`` runs.

    Reads the module-level ``args``, ``timestamps``, ``actor_count``,
    ``host_count`` and ``link_count``; each run is delegated to ``do_run()``.
    """
    # (cmonkey option name, number of resources), in the order they are tested.
    resources = (("pid", actor_count), ("host", host_count), ("link", link_count))
    test_todo = 2 * len(timestamps) * (actor_count + host_count + link_count)

    prev_time = 0
    for now in timestamps:
        # Midpoint of the [prev_time, now] interval first, then the timestamp itself.
        for kill_time in ((prev_time + now) / 2, now):
            for kind, count in resources:
                for rank in range(count):
                    do_run(args.command, [f"--cfg=cmonkey/time:{kill_time}", f"--cfg=cmonkey/{kind}:{rank}"], test_todo)
        # Remember this timestamp so that the next midpoint falls inside the next interval.
        prev_time = now
# Launch the whole campaign of simulations.
doit()

print(f"In total, the monkey found {error_count} errors.")
# Only the low 8 bits of the exit status reach the parent process on POSIX
# systems: without the cap, a multiple of 256 errors would be seen as a
# success (status 0) by the caller.
sys.exit(min(error_count, 255))