File: s4u-exec-ptask.cpp

package info (click to toggle)
simgrid 4.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 38,980 kB
  • sloc: cpp: 123,583; ansic: 66,779; python: 8,358; java: 6,406; fortran: 6,079; f90: 5,123; xml: 4,587; sh: 2,337; perl: 1,436; makefile: 105; lisp: 49; javascript: 7; sed: 6
file content (145 lines) | stat: -rw-r--r-- 6,510 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/* Copyright (c) 2017-2025. The SimGrid Team. All rights reserved.          */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/* Parallel activities are convenient abstractions of parallel computational kernels that span over several machines.
 * To create a new one, you have to provide several things:
 *   - a vector of hosts on which the activity will execute
 *   - a vector of values, the amount of computation for each of the hosts (in flops)
 *   - a matrix of values, the amount of communication between each pair of hosts (in bytes)
 *
 * Each of these operation will be processed at the same relative speed.
 * This means that at some point in time, all sub-executions and all sub-communications will be at 20% of completion.
 * Also, they will all complete at the exact same time.
 *
 * This is obviously a simplistic abstraction, but this is very handful in a large amount of situations.
 *
 * Please note that you must have the LV07 platform model enabled to use such constructs.
 */

#include <simgrid/s4u.hpp>

XBT_LOG_NEW_DEFAULT_CATEGORY(s4u_ptask, "Messages specific for this s4u example");
namespace sg4 = simgrid::s4u;

static void runner()
{
  /* Retrieve the list of all hosts as an array of hosts */
  auto hosts         = sg4::Engine::get_instance()->get_all_hosts();
  size_t hosts_count = hosts.size();

  std::vector<double> computation_amounts;
  std::vector<double> communication_amounts;

  /* ------[ test 1 ]----------------- */
  XBT_INFO("First, build a classical parallel activity, with 1 Gflop to execute on each node, "
           "and 10MB to exchange between each pair");

  computation_amounts.assign(hosts_count, 1e9 /*1Gflop*/);
  communication_amounts.assign(hosts_count * hosts_count, 0);
  for (size_t i = 0; i < hosts_count; i++)
    for (size_t j = i + 1; j < hosts_count; j++)
      communication_amounts[i * hosts_count + j] = 1e7; // 10 MB

  sg4::this_actor::parallel_execute(hosts, computation_amounts, communication_amounts);

  /* ------[ test 2 ]----------------- */
  XBT_INFO("We can do the same with a timeout of 10 seconds enabled.");
  computation_amounts.assign(hosts_count, 1e9 /*1Gflop*/);
  communication_amounts.assign(hosts_count * hosts_count, 0);
  for (size_t i = 0; i < hosts_count; i++)
    for (size_t j = i + 1; j < hosts_count; j++)
      communication_amounts[i * hosts_count + j] = 1e7; // 10 MB

  sg4::ExecPtr activity = sg4::this_actor::exec_init(hosts, computation_amounts, communication_amounts);
  try {
    activity->wait_for(10.0 /* timeout (in seconds)*/);
    xbt_die("Woops, this did not timeout as expected... Please report that bug.");
  } catch (const simgrid::TimeoutException&) {
    XBT_INFO("Caught the expected timeout exception.");
    activity->cancel();
  }

  /* ------[ test 3 ]----------------- */
  XBT_INFO("Then, build a parallel activity involving only computations (of different amounts) and no communication");
  computation_amounts = {3e8, 6e8, 1e9}; // 300Mflop, 600Mflop, 1Gflop
  communication_amounts.clear();         // no comm
  sg4::this_actor::parallel_execute(hosts, computation_amounts, communication_amounts);

  /* ------[ test 4 ]----------------- */
  XBT_INFO("Then, build a parallel activity with no computation nor communication (synchro only)");
  computation_amounts.clear();
  communication_amounts.clear();
  sg4::this_actor::parallel_execute(hosts, computation_amounts, communication_amounts);

  /* ------[ test 5 ]----------------- */
  XBT_INFO("Then, Monitor the execution of a parallel activity");
  computation_amounts.assign(hosts_count, 1e6 /*1Mflop*/);
  communication_amounts = {0, 1e6, 0, 0, 0, 1e6, 1e6, 0, 0};
  activity              = sg4::this_actor::exec_init(hosts, computation_amounts, communication_amounts);
  activity->start();

  while (not activity->test()) {
    XBT_INFO("Remaining flop ratio: %.0f%%", 100 * activity->get_remaining_ratio());
    sg4::this_actor::sleep_for(5);
  }
  activity->wait();

  /* ------[ test 6 ]----------------- */
  XBT_INFO("Finally, simulate a malleable task (a parallel execution that gets reconfigured after its start).");
  XBT_INFO("  - Start a regular parallel execution, with both comm and computation");
  computation_amounts.assign(hosts_count, 1e6 /*1Mflop*/);
  communication_amounts = {0, 1e6, 0, 0, 1e6, 0, 1e6, 0, 0};
  activity              = sg4::this_actor::exec_init(hosts, computation_amounts, communication_amounts);
  activity->start();

  sg4::this_actor::sleep_for(10);
  double remaining_ratio = activity->get_remaining_ratio();
  XBT_INFO("  - After 10 seconds, %.2f%% remains to be done. Change it from 3 hosts to 2 hosts only.",
           remaining_ratio * 100);
  XBT_INFO("    Let's first suspend the task.");
  activity->suspend();

  XBT_INFO("  - Now, simulate the reconfiguration (modeled as a comm from the removed host to the remaining ones).");
  std::vector<double> rescheduling_comp{0, 0, 0};
  std::vector<double> rescheduling_comm{0, 0, 0, 0, 0, 0, 25000, 25000, 0};
  sg4::this_actor::parallel_execute(hosts, rescheduling_comp, rescheduling_comm);

  XBT_INFO("  - Now, let's cancel the old task and create a new task with modified comm and computation vectors:");
  XBT_INFO("    What was already done is removed, and the load of the removed host is shared between remaining ones.");
  for (int i = 0; i < 2; i++) {
    // remove what we've done so far, for both comm and compute load
    computation_amounts[i]   *= remaining_ratio;
    communication_amounts[i] *= remaining_ratio;
    // The work from 1 must be shared between 2 remaining ones. 1/2=50% of extra work for each
    computation_amounts[i]   *= 1.5;
    communication_amounts[i] *= 1.5;
  }
  hosts.resize(2);
  computation_amounts.resize(2);
  double remaining_comm = communication_amounts[1];
  communication_amounts = {0, remaining_comm, remaining_comm, 0}; // Resizing a linearized matrix is hairly

  activity->cancel();
  activity = sg4::this_actor::exec_init(hosts, computation_amounts, communication_amounts);

  XBT_INFO("  - Done, let's wait for the task completion");
  activity->wait();

  XBT_INFO("Goodbye now!");
}

int main(int argc, char* argv[])
{
  sg4::Engine e(&argc, argv);

  xbt_assert(argc == 2, "Usage: %s <platform file>", argv[0]);

  e.load_platform(argv[1]);
  e.add_actor("test", e.host_by_name("MyHost1"), runner);

  e.run();
  XBT_INFO("Simulation done.");
  return 0;
}