File: exec_ptask.java

package info (click to toggle)
simgrid 4.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 39,192 kB
  • sloc: cpp: 124,913; ansic: 66,744; python: 8,560; java: 6,773; fortran: 6,079; f90: 5,123; xml: 4,587; sh: 2,194; perl: 1,436; makefile: 111; lisp: 49; javascript: 7; sed: 6
file content (153 lines) | stat: -rw-r--r-- 6,893 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/* Copyright (c) 2017-2025. The SimGrid Team. All rights reserved.          */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/* Parallel activities are convenient abstractions of parallel computational kernels that span over several machines.
 * To create a new one, you have to provide several things:
 *   - a vector of hosts on which the activity will execute
 *   - a vector of values, the amount of computation for each of the hosts (in flops)
 *   - a matrix of values, the amount of communication between each pair of hosts (in bytes)
 *
 * Each of these operations will be processed at the same relative speed.
 * This means that at some point in time, all sub-executions and all sub-communications will be at 20% of completion.
 * Also, they will all complete at the exact same time.
 *
 * This is obviously a simplistic abstraction, but it is very handy in a large number of situations.
 *
 * Please note that you must have the LV07 platform model enabled to use such constructs.
 */

import org.simgrid.s4u.*;

/**
 * Actor walking through six parallel-activity scenarios, one after the other: a plain parallel execution, the same
 * one interrupted by a timeout, a computation-only one, an empty (synchronization-only) one, a monitored one, and
 * finally a "malleable" one that is suspended, cancelled and restarted on fewer hosts.
 *
 * Communication amounts are always passed as a linearized square matrix: the cell for the pair (row, col) is stored
 * at index {@code row * hostCount + col}.
 */
class runner extends Actor {
  /** Builds a new array of {@code size} cells, each of them holding {@code value}. */
  private static double[] uniform(int size, double value)
  {
    double[] cells = new double[size];
    for (int k = 0; k < cells.length; k++)
      cells[k] = value;
    return cells;
  }

  /**
   * Writes the "classical" workload in place: 1 Gflop for each host, and 10 MB in every cell located strictly above
   * the diagonal of the linearized communication matrix. Other cells are left as they are.
   */
  private static void loadClassicalAmounts(double[] flops, double[] bytes, int hostCount)
  {
    for (int row = 0; row < hostCount; row++) {
      flops[row] = 1e9; // 1 Gflop
      for (int col = row + 1; col < hostCount; col++)
        bytes[row * hostCount + col] = 1e7; // 10 MB
    }
  }

  /** Runs the six scenarios in sequence, logging what happens at each step. */
  public void run() throws TimeoutException, HostFailureException
  {
    /* Every host of the platform takes part in the parallel activities */
    var allHosts   = this.get_engine().get_all_hosts();
    int hostCount  = allHosts.length;
    double[] flops = new double[hostCount];
    double[] bytes = new double[hostCount * hostCount];

    /* ------[ test 1 ]----------------- */
    Engine.info("First, build a classical parallel activity, with 1 Gflop to execute on each node, "
                + "and 10MB to exchange between each pair");

    loadClassicalAmounts(flops, bytes, hostCount);
    this.parallel_execute(allHosts, flops, bytes);

    /* ------[ test 2 ]----------------- */
    Engine.info("We can do the same with a timeout of 10 seconds enabled.");
    loadClassicalAmounts(flops, bytes, hostCount);

    Exec ptask = this.exec_init(allHosts, flops, bytes);
    try {
      ptask.await_for(10.0 /* seconds before giving up */);
      // Reaching this line means that the wait returned normally instead of timing out
      Engine.die("Woops, this did not timeout as expected... Please report that bug.");
    } catch (TimeoutException expected) {
      Engine.info("Caught the expected timeout exception.");
      ptask.cancel();
    }

    /* ------[ test 3 ]----------------- */
    Engine.info(
        "Then, build a parallel activity involving only computations (of different amounts) and no communication");
    flops = new double[] {3e8, 6e8, 1e9}; // 300Mflop, 600Mflop, 1Gflop
    bytes = null;                         // nothing to exchange
    this.parallel_execute(allHosts, flops, bytes);

    /* ------[ test 4 ]----------------- */
    Engine.info("Then, build a parallel activity with no computation nor communication (synchro only)");
    this.parallel_execute(allHosts, null, null);

    /* ------[ test 5 ]----------------- */
    Engine.info("Then, Monitor the execution of a parallel activity");
    flops = uniform(hostCount, 1e6 /* 1Mflop */);
    bytes = new double[] {0, 1e6, 0, 0, 0, 1e6, 1e6, 0, 0};
    ptask = this.exec_init(allHosts, flops, bytes);
    ptask.start();

    // Poll the activity, reporting its progress every 5 seconds until test() says it is over
    while (!ptask.test()) {
      Engine.info("Remaining flop ratio: %.0f%%", 100 * ptask.get_remaining_ratio());
      this.sleep_for(5);
    }
    ptask.await();

    /* ------[ test 6 ]----------------- */
    Engine.info("Finally, simulate a malleable task (a parallel execution that gets reconfigured after its start).");
    Engine.info("  - Start a regular parallel execution, with both comm and computation");
    flops = uniform(hostCount, 1e6 /* 1Mflop */);
    bytes = new double[] {0, 1e6, 0, 0, 1e6, 0, 1e6, 0, 0};
    ptask = this.exec_init(allHosts, flops, bytes);
    ptask.start();

    this.sleep_for(10);
    double remainingRatio = ptask.get_remaining_ratio();
    Engine.info("  - After 10 seconds, %.2f%% remains to be done. Change it from 3 hosts to 2 hosts only.",
                remainingRatio * 100);
    Engine.info("    Let's first suspend the task.");
    ptask.suspend();

    Engine.info(
        "  - Now, simulate the reconfiguration (modeled as a comm from the removed host to the remaining ones).");
    double[] migrationFlops = {0, 0, 0};
    double[] migrationBytes = {0, 0, 0, 0, 0, 0, 25000, 25000, 0}; // only the third row of the matrix is non-null
    this.parallel_execute(allHosts, migrationFlops, migrationBytes);

    Engine.info("  - Now, let's cancel the old task and create a new task with modified comm and computation vectors:");
    Engine.info(
        "    What was already done is removed, and the load of the removed host is shared between remaining ones.");
    for (int kept = 0; kept < 2; kept++) {
      // Keep only the part that is still to be done, then add 50% to it: the load of the removed host is split
      // between the two remaining ones.
      flops[kept] = flops[kept] * remainingRatio * 1.5;
      bytes[kept] = bytes[kept] * remainingRatio * 1.5;
    }
    Host[] survivors       = {allHosts[0], allHosts[1]};
    double[] survivorFlops = {flops[0], flops[1]};
    double leftoverBytes   = bytes[1];
    // Resizing a linearized matrix is hairy, so the 2x2 one is simply written down by hand
    bytes = new double[] {0, leftoverBytes, leftoverBytes, 0};

    ptask.cancel();
    ptask = this.exec_init(survivors, survivorFlops, bytes);

    Engine.info("  - Done, let's wait for the task completion");
    ptask.await();

    Engine.info("Goodbye now!");
  }
}

/** Entry point of the example: loads a platform, starts one {@code runner} actor on "MyHost1" and runs the simulation. */
public class exec_ptask {
  /**
   * Sets up and runs the simulation.
   *
   * @param args command-line arguments; the first one is used as the platform file to load
   */
  public static void main(String[] args)
  {
    Engine engine = new Engine(args);
    if (args.length == 0) {
      Engine.die("Usage: exec_ptask <platform file>");
    }

    String platformFile = args[0];
    engine.load_platform(platformFile);

    var startHost = engine.host_by_name("MyHost1");
    startHost.add_actor("test", new runner());

    engine.run();
    Engine.info("Simulation done.");

    // Not needed in user code: this call only exists so that our continuous integration can track memleaks
    engine.force_garbage_collection();
  }
}