File: threadlist.cpp

package info (click to toggle)
dmtcp 2.6.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 6,496 kB
  • sloc: cpp: 33,592; ansic: 28,099; sh: 6,735; makefile: 1,950; perl: 1,690; python: 1,241; asm: 138; java: 13
file content (1083 lines) | stat: -rw-r--r-- 39,166 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
#include <pthread.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <semaphore.h>
#include <sys/resource.h>
#include <linux/version.h>
#include "config.h"
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) || defined(HAS_PR_SET_PTRACER)
# include <sys/prctl.h>
#endif
#include "threadlist.h"
#include "siginfo.h"
#include "dmtcpalloc.h"
#include "syscallwrappers.h"
#include "mtcpinterface.h"
#include "ckptserializer.h"
#include "uniquepid.h"
#include "jalloc.h"
#include "jassert.h"
#include "util.h"
#include "mtcp/mtcp_header.h"

// For i386 and x86_64, SETJMP currently has bugs.  Don't turn this
//   on for them until they are debugged.
// Default is to use  setcontext/getcontext.
#if defined(__arm__) || defined(__aarch64__)
# define SETJMP /* setcontext/getcontext not defined for ARM glibc */
#endif

#ifdef SETJMP
# include <setjmp.h>
#else
# include <ucontext.h>
#endif


using namespace dmtcp;

//Globals
// Set to true by ThreadList::postRestart() and cleared again by the checkpoint
// thread at the top of its loop; stopthisthread() reads it to decide between
// the "suspend for checkpoint" path and the "resume after restart" path.
volatile bool restoreInProgress = false;
// Descriptor of the thread that called ThreadList::init().
Thread *motherofall = NULL;
// Point at motherofall->saved_sp and motherofall->tlsInfo (set in init()).
void **motherofall_saved_sp = NULL;
ThreadTLSInfo *motherofall_tlsInfo = NULL;
// Value of THREAD_REAL_TID() taken in init() and again in postRestart().
pid_t motherpid = 0;
// Signals re-raised for the whole process by the checkpoint thread in
// waitForAllRestored().
sigset_t sigpending_global;
// Head of the singly linked (via Thread::next) list of known threads.
// Walked under threadlistLock in suspendThreads().
Thread *activeThreads = NULL;
// Captured from TLSInfo_GetThreadSysinfo() by the checkpoint thread.
void *saved_sysinfo;
// Copied into the MTCP header by prepareMtcpHeader().
MYINFO_GS_T myinfo_gs __attribute__ ((visibility ("hidden")));

// Prefix added to a thread's name (prctl PR_SET_NAME) on the restart path.
static const char* DMTCP_PRGNAME_PREFIX = "DMTCP:";

// NOTE(review): not referenced in the visible part of this file; presumably a
// pool of recycled Thread descriptors -- confirm.
static Thread *threads_freelist = NULL;
// Taken by lock_threads() / released by unlk_threads().
static pthread_mutex_t threadlistLock = PTHREAD_MUTEX_INITIALIZER;
// NOTE(review): not referenced in the visible part of this file; presumably
// serializes Thread_UpdateState() -- confirm.
static pthread_mutex_t threadStateLock = PTHREAD_MUTEX_INITIALIZER;

// Write-locked by the checkpoint thread for the duration of a checkpoint;
// suspended user threads block on a read-lock of it until it is released.
static pthread_rwlock_t *threadResumeLock = NULL;

// Per-thread pointer to the calling thread's own descriptor (see updateTid()).
static __thread Thread *curThread = NULL;
// Descriptor of the checkpoint thread (set at the top of checkpointhread()).
static Thread *ckptThread = NULL;
// Number of threads counted as suspended/suspending by suspendThreads().
static int numUserThreads = 0;
// True from init() until the checkpoint thread first passes its restart point.
static bool originalstartup;

// Defined elsewhere; set to true by the checkpoint thread on startup.
extern bool sem_launch_first_time;
extern sem_t sem_launch; // allocated in coordinatorapi.cpp
// Posted by each user thread once its state is saved (or restored); the
// checkpoint thread waits on it numUserThreads times.
static sem_t semNotifyCkptThread;
// Posted numUserThreads times by the checkpoint thread after restart; each
// restored user thread waits on it once.
static sem_t semWaitForCkptThreadSignal;

// Forward declarations of file-local helpers.
static void *checkpointhread (void *dummy);
static void suspendThreads();
static void stopthisthread(int sig);
static int restarthread(void *threadv);
static int Thread_UpdateState(Thread *th,
                              ThreadState newval,
                              ThreadState oldval);
static void Thread_SaveSigState(Thread *th);
static void Thread_RestoreSigState(Thread *th);

// Copied from src/plugin/pid/pid_syscallsreal.c
// Without this, libdmtcp.so will depend on libdmtcp_plugin.so being loaded
/* Issue the getpid system call through _real_syscall() instead of calling
 * libc's getpid().
 */
static pid_t _real_getpid(void)
{
  // NOTE(review): the argument below is a string literal, i.e. always
  // non-null.  If JWARNING only reports when its argument is false (as its
  // use with prctl() elsewhere in this file suggests), this never prints --
  // confirm that is intended.
  JWARNING("_real_getpid")
          .Text("FIXME: _real_getpid returning virtual pid, not real pid.");
  // libc caches the process pid, so after a restart libc:getpid() would
  // still hand back the pre-checkpoint value; hence the direct syscall.
  const pid_t pid = (pid_t)_real_syscall(SYS_getpid);
  return pid;
}
// Copied from src/plugin/pid/pid.c and .../pid/pid_syscallsreal.c
// Without this, libdmtcp.so will depend on libdmtcp_plugin.so being loaded
/* Library-private accessor: returns whatever _real_getpid() reports. */
LIB_PRIVATE
pid_t dmtcp_get_real_pid()
{
  const pid_t pid = _real_getpid();
  return pid;
}

/*****************************************************************************
 *
 * Lock and unlock the 'activeThreads' list
 *
 *****************************************************************************/
/* Acquire threadlistLock; aborts (JASSERT) if the lock cannot be taken.
 *
 * pthread_mutex_lock() reports failure through its return value and does not
 * set errno, so the returned code -- not errno -- is what gets logged.
 * (Assumes _real_pthread_mutex_lock forwards that return value unchanged.)
 */
static void lock_threads (void) {
  const int rc = _real_pthread_mutex_lock(&threadlistLock);
  JASSERT(rc == 0) (rc);
}
/* Release threadlistLock; aborts (JASSERT) if the unlock fails.
 *
 * pthread_mutex_unlock() reports failure through its return value and does
 * not set errno, so the returned code -- not errno -- is what gets logged.
 * (Assumes _real_pthread_mutex_unlock forwards that return value unchanged.)
 */
static void unlk_threads (void) {
  const int rc = _real_pthread_mutex_unlock(&threadlistLock);
  JASSERT(rc == 0) (rc);
}

/*****************************************************************************
 *
 * We will use the region beyond the end of stack for our temporary stack.
 * glibc sigsetjmp will mangle pointers;  We need the unmangled pointer.
 * So, we can't rely on parsing the jmpbuf for the saved sp.
 *
 *****************************************************************************/
/* Store the current stack-pointer register value into *sp.
 *
 * Implemented with inline assembly for x86/x86_64 and ARM/AArch64; any other
 * architecture is rejected at compile time.
 */
static void save_sp(void **sp)
{
#if defined(__i386__) || defined(__x86_64__)
  // CLEAN_FOR_64_BIT is a project macro defined outside this file;
  // presumably it adapts the 32-bit register name for x86_64 -- confirm.
  asm volatile (CLEAN_FOR_64_BIT(mov %%esp,%0)
		: "=g" (*sp)
                : : "memory");
#elif defined(__arm__) || defined(__aarch64__)
  // Copy the 'sp' register into the output operand.
  asm volatile ("mov %0,sp"
		: "=r" (*sp)
                : : "memory");
#else
# error "assembly instruction not translated"
#endif
}

/*****************************************************************************
 *
 * Get _real_ tid/pid
 *
 *****************************************************************************/

/*****************************************************************************
 *
 * New process. Empty the activeThreads list
 *
 *****************************************************************************/
void ThreadList::resetOnFork()
{
  lock_threads();
  while (activeThreads != NULL) {
    ThreadList::threadIsDead(activeThreads); // takes care of updating "activeThreads" ptr.
  }
  unlk_threads();
}

/*****************************************************************************
 *
 *  This routine must be called at startup time to initiate checkpointing
 *
 *****************************************************************************/
/* One-time startup: register the calling thread as 'motherofall', install the
 * checkpoint signal handler, create the checkpoint thread, and block until
 * that thread posts 'sem_launch'.
 */
void ThreadList::init()
{
  /* Record this process's pid and verify that the TLS holds it where we
   * expect.  On restore, each thread's TLS must be patched with the new
   * motherpid.  We also assume that GS uses the first GDT entry for its
   * descriptor.
   *
   * libc/getpid can lie if kernel fork() was used instead of libc fork(),
   * so the real tid is queried directly.
   */
  motherpid = THREAD_REAL_TID();
  TLSInfo_VerifyPidTid(motherpid, motherpid);

  SigInfo::setupCkptSigHandler(&stopthisthread);

  // CONTEXT:  updateTid() assigns curThread only when it is NULL.
  //   ... -> initializeMtcpEngine() -> ThreadList::init() -> updateTid()
  // See addToActiveList() for more information.
  curThread = NULL;

  /* Register the caller as one of our threads so we can work on it. */
  motherofall = ThreadList::getNewThread();
  motherofall_saved_sp = &motherofall->saved_sp;
  motherofall_tlsInfo = &motherofall->tlsInfo;
  updateTid(motherofall);

  sem_init(&sem_launch, 0, 0);
  sem_init(&semNotifyCkptThread, 0, 0);
  sem_init(&semWaitForCkptThreadSignal, 0, 0);

  originalstartup = true;
  pthread_t checkpointhreadid;
  /* Start the thread that will perform the checkpoints from time to time. */
  JASSERT(pthread_create(&checkpointhreadid, NULL, checkpointhread, NULL) == 0);

  /* Hold the caller here until the checkpoint thread has finished
   * initializing.  Some programs (like gcl) implement their own glibc
   * functions in a non-thread-safe manner; in that case the checkpoint thread
   * and the user thread must not run at the same time.
   * A wait that was merely interrupted by a signal (EINTR) is retried.
   */
  int waitResult;
  do {
    errno = 0;
    waitResult = sem_wait(&sem_launch);
  } while (waitResult == -1 && errno == EINTR);
  sem_destroy(&sem_launch);
}

/*****************************************************************************
 *
 *****************************************************************************/
// Called from:  threadwrappers.cpp:__clone()
/* Fill in a thread descriptor with the arguments supplied by the caller and
 * put it into its initial state.
 *
 * th    - descriptor to initialize
 * fn    - start routine, stored verbatim
 * arg   - argument for 'fn', stored verbatim
 * flags - flags value, stored verbatim
 * ptid  - stored verbatim
 * ctid  - stored verbatim
 */
void ThreadList::initThread(Thread* th, int (*fn)(void*), void *arg, int flags,
                            int *ptid, int *ctid)
{
  /* Bookkeeping: not linked into any list yet, running, no name recorded. */
  th->next        = NULL;
  th->state       = ST_RUNNING;
  th->procname[0] = '\0';

  /* Remember exactly what the caller handed us. */
  th->fn    = fn;
  th->arg   = arg;
  th->flags = flags;
  th->ptid  = ptid;
  th->ctid  = ctid;
}

/*****************************************************************************
 *
 * Thread exited/exiting.
 *
 *****************************************************************************/
void ThreadList::threadExit()
{
  curThread->state = ST_ZOMBIE;
}

/*****************************************************************************
 *
 *****************************************************************************/
/* Record the calling thread's real and virtual tid in 'th', refresh the pid
 * stored in TLS, and add 'th' to the active list.  If this thread has no
 * 'curThread' yet, 'th' becomes it.
 */
void ThreadList::updateTid(Thread *th)
{
  if (!curThread) {
    curThread = th;
  }
  th->tid = THREAD_REAL_TID();
  th->virtual_tid = dmtcp_gettid();
  JLOG(DMTCP)("starting thread") (th->tid) (th->virtual_tid);

  /* Why the pid in TLS is refreshed for every new thread:
   *
   * libpthread may recycle a thread stack once its thread has exited (via
   * return, pthread_exit, or pthread_cancel), handing it to a thread created
   * by a later pthread_create().
   *
   * Part of that stack holds the "struct pthread", which has pid and tid
   * member fields.  When the stack is reused, tid is reset but pid is left
   * alone (on the assumption that the pid never changes).  If the old thread
   * exited before a checkpoint and the new one is created after restart, the
   * pid field would still hold the pre-ckpt pid rather than the current one.
   *
   * Writing the motherpid into the pid slot on every thread creation keeps
   * struct pthread correct.
   */
  TLSInfo_UpdatePid();

  // Adding to the active list also drops any descriptor that carries the
  // same tid as ours, and removes dead threads from the list.
  ThreadList::addToActiveList(th);
}

/*************************************************************************
 *
 *  Send a signal to ckpt-thread to wake it up from select call and exit.
 *
 *************************************************************************/
/* Deliver the checkpoint signal to the checkpoint thread (addressed by
 * motherpid and the thread's recorded tid).
 */
void ThreadList::killCkpthread()
{
  JLOG(DMTCP)("Kill checkpointhread") (ckptThread->tid);
  const int ckptSig = SigInfo::ckptSignal();
  THREAD_TGKILL(motherpid, ckptThread->tid, ckptSig);
}

/*************************************************************************
 *
 *  Prepare MTCP Header
 *
 *************************************************************************/
/* Populate *mtcpHdr with everything the restart program needs: signature,
 * saved program break, restore-buffer location, vdso/vvar/stack boundaries,
 * the post-restart entry points, and the TLS details of 'motherofall'.
 *
 * mtcpHdr - header to fill in; it is zeroed first, so every field not
 *           assigned below is left as zero.
 */
static void prepareMtcpHeader(MtcpHeader *mtcpHdr)
{
  memset(mtcpHdr, 0, sizeof(*mtcpHdr));
  // Bound the copy by the destination buffer, not by the source length
  // (bounding by strlen(src)+1 offers no overflow protection at all).
  // Copying at most size-1 bytes keeps the final byte at the zero written
  // by the memset above, so the signature is always NUL-terminated.
  strncpy(mtcpHdr->signature, MTCP_SIGNATURE, sizeof(mtcpHdr->signature) - 1);
  mtcpHdr->saved_brk = sbrk(0);
  // TODO: Now that we have a separate mtcp dir, the code dealing with
  // restoreBuf should go in there.
  mtcpHdr->restore_addr = (void*) ProcessInfo::instance().restoreBufAddr();
  mtcpHdr->restore_size = ProcessInfo::instance().restoreBufLen();

  mtcpHdr->vdsoStart = (void*) ProcessInfo::instance().vdsoStart();
  mtcpHdr->vdsoEnd = (void*) ProcessInfo::instance().vdsoEnd();
  mtcpHdr->vvarStart = (void*) ProcessInfo::instance().vvarStart();
  mtcpHdr->vvarEnd = (void*) ProcessInfo::instance().vvarEnd();
  mtcpHdr->stackEnd = (void*) ProcessInfo::instance().stackEnd();

  // Entry points the restart program calls once memory has been restored.
  mtcpHdr->post_restart = &ThreadList::postRestart;
  mtcpHdr->post_restart_debug = &ThreadList::postRestartDebug;
  memcpy(&mtcpHdr->motherofall_tls_info,
         &motherofall->tlsInfo,
         sizeof(motherofall->tlsInfo));
  mtcpHdr->tls_pid_offset = TLSInfo_GetPidOffset();
  mtcpHdr->tls_tid_offset = TLSInfo_GetTidOffset();
  mtcpHdr->myinfo_gs = myinfo_gs;
}

/*************************************************************************
 *
 *  This executes as a thread.  It sleeps for the checkpoint interval
 *    seconds, then wakes to write the checkpoint file.
 *
 *************************************************************************/
static void *checkpointhread (void *dummy)
{
  /* This is the start function of the checkpoint thread.
   * We also call sigsetjmp/getcontext to get a snapshot of this call frame,
   * since we will never exit this call frame.  We always return
   * to this call frame at time of startup, on restart.  Hence, restart
   * will forget any modifications to our local variables since restart.
   *
   * 'dummy' is unused.  The function never returns in practice: the
   * while(1) loop below has no exit.
   */

  ckptThread = curThread;
  ckptThread->state = ST_CKPNTHREAD;
  // Important:  we set this in the ckpt thread to avoid a race,
  //     since: (i) the ckpt thread must read this; and (ii) if we had
  //     set it earlier, it could be invoked and modified earlier
  //     inside a generic command like CoordinatorAPI::recvMsgFromCoordi).
  sem_launch_first_time = true;

  /* For checkpoint thread, we want to block delivery of all but some special
   * signals
   */
  {
    /*
     * For the checkpoint thread, we should not block SIGSETXID which is used
     * by the setsid family of system calls to change the session leader. Glibc
     * uses this signal to notify the process threads of the change in session
     * leader information. This signal is not documented and is used internally
     * by glibc. It is defined in <glibc-src-root>/nptl/pthreadP.h
     * screen was getting affected by this since it used setsid to change the
     * session leaders.
     * Similarly, SIGCANCEL/SIGTIMER is undocumented, but used by glibc.
     */
#define SIGSETXID (__SIGRTMIN + 1)
#define SIGCANCEL (__SIGRTMIN) /* aka SIGTIMER */
    sigset_t set;

    sigfillset(&set);
    sigdelset(&set, SIGSETXID);
    sigdelset(&set, SIGCANCEL);

    JASSERT(pthread_sigmask(SIG_SETMASK, &set, NULL) == 0);
  }

  Thread_SaveSigState(ckptThread);
  TLSInfo_SaveTLSState(&ckptThread->tlsInfo);

  /* Set up our restart point.  I.e., we get jumped to here after a restore. */
#ifdef SETJMP
  JASSERT(sigsetjmp(ckptThread->jmpbuf, 1) >= 0) (JASSERT_ERRNO);
#else
  JASSERT(getcontext(&ckptThread->savctx) == 0) (JASSERT_ERRNO);
#endif
  save_sp(&ckptThread->saved_sp);
  JLOG(DMTCP)("after sigsetjmp/getcontext")
    (curThread->tid) (curThread->virtual_tid) (curThread->saved_sp);

  if (originalstartup) {
    // First pass through the restart point: this is the original launch.
    originalstartup = false;
  } else {
    /* We are being restored.  Wait for all other threads to finish being
     * restored before resuming checkpointing.
     */
    JLOG(DMTCP)("waiting for other threads after restore");
    ThreadList::waitForAllRestored(ckptThread);
    JLOG(DMTCP)("resuming after restore");
  }

  /* This is a sleep-checkpoint-resume loop by the checkpoint thread.
   * On restart, we arrive back at getcontext, above, and then re-enter the loop.
   */
  while (1) {
    /* Wait a while between writing checkpoint files */
    JLOG(DMTCP)("before callbackSleepBetweenCheckpoint(0)");
    callbackSleepBetweenCheckpoint(0);

    restoreInProgress = false;

    // We need to reinitialize the lock.
    // The lock is held for writing until the checkpoint is complete;
    // suspended user threads block on a read-lock of it in stopthisthread().
    pthread_rwlock_t rwLock = PTHREAD_RWLOCK_INITIALIZER;
    threadResumeLock = &rwLock;
    JASSERT(_real_pthread_rwlock_wrlock(threadResumeLock) == 0) (JASSERT_ERRNO);

    suspendThreads();
    SigInfo::saveSigHandlers();
    /* Do this once, same for all threads.  But restore for each thread. */
    if (TLSInfo_HaveThreadSysinfoOffset())
      saved_sysinfo = TLSInfo_GetThreadSysinfo();

    /* All other threads halted in 'stopthisthread' routine (they are all
     * in state ST_SUSPENDED).  It's safe to write checkpoint file now.
     */

    // Update generation, in case user callback calls dmtcp_get_generation().
    uint32_t computation_generation =
               SharedData::getCompId()._computation_generation;
    ProcessInfo::instance().set_generation(computation_generation);

    JLOG(DMTCP)("before callbackPreCheckpoint()");
    callbackPreCheckpoint();

    // Remove stale threads from activeThreads list.
    ThreadList::emptyFreeList();

    MtcpHeader mtcpHdr;
    prepareMtcpHeader(&mtcpHdr);
    /* That's it, folks.  We just did the checkpoint.  After this, we will meet
     *   on the flip side of checkpoint.
     */
    CkptSerializer::writeCkptImage(&mtcpHdr, sizeof(mtcpHdr));

    /* NOTE: This code is only for the checkpoint thread.  If you're looking for
     *      what the user threads do at checkpoint time, see:  stopthisthread()
     *
     * There are two ways for the checkpoint thread to return from a checkpoint:
     *                 resume and restart
     * If we're here, we just resume'd after checkpoint.  It's the same process.
     * If we chose restart, 'bin/mtcp_restart' created a new process.  The
     *   source code is in 'src/mtcp'.  The program 'bin/mtcp_restart' will map
     *   our memory into the new process, and then meet us back here by calling
     *   the function specified by 'mtcpHdr->post_restart':
     *                                        ThreadList::postRestart().
     *   Actually, postRestart() will start the user threads and then call
     *   restarthread() for the 'motherofall' thread.  Then, restarthread()
     *   will call setcontext(), in order to arrive back at getcontext() here
     *   in this function, just before the 'while(1)' loop.
     * FIXME:  The 'motherofall' thread is the primary thread of the process.
     *   On launch, 'motherofall' was the user thread executing main().
     *   and the checkpoint thread was the second thread.  But now,
     *   motherofall will be the checkpoint thread.  Why do we switch at the
     *   time of restart?  Should we fix this?
     */
    JLOG(DMTCP)("before callbackPostCheckpoint(false, NULL)");
    callbackPostCheckpoint(false, NULL);

    /* Resume all threads. */
    JLOG(DMTCP)("resuming everything");
    JASSERT(_real_pthread_rwlock_unlock(threadResumeLock) == 0) (JASSERT_ERRNO);
    JLOG(DMTCP)("everything resumed");
  }
  return NULL;
}

/* Called by the checkpoint thread.  Repeatedly scans 'activeThreads' (under
 * the thread-list lock), signalling running threads and discarding dead
 * ones, until no thread is left running or merely signalled.  It then waits
 * for one 'semNotifyCkptThread' post per counted user thread.
 */
static void suspendThreads()
{
  int needrescan;
  Thread *thread;
  Thread *next;

  /* Halt all other threads - force them to call stopthisthread
   * If any have blocked checkpointing, wait for them to unblock before
   * signalling
   */
  lock_threads();
  do {
    needrescan = 0;
    numUserThreads = 0;
    // 'next' is fetched up front because threadIsDead() may remove 'thread'.
    for (thread = activeThreads; thread != NULL; thread = next) {
      next = thread->next;
      int ret;
      /* Do various things based on thread's state */
      switch (thread->state) {

        case ST_RUNNING:
          /* Thread is running. Send it a signal so it will call stopthisthread.
           * We will need to rescan (hopefully it will be suspended by then)
           */
          if (Thread_UpdateState(thread, ST_SIGNALED, ST_RUNNING)) {
            if (THREAD_TGKILL(motherpid, thread->tid, SigInfo::ckptSignal()) < 0) {
              JASSERT(errno == ESRCH) (JASSERT_ERRNO) (thread->tid)
                .Text("error signalling thread");
              ThreadList::threadIsDead(thread);
            } else {
              needrescan = 1;
            }
          }
          break;

        case ST_ZOMBIE:
          // Signal 0 only probes for existence; ESRCH means the thread is gone.
          ret = THREAD_TGKILL(motherpid, thread->tid, 0);
          JASSERT(ret == 0 || errno == ESRCH);
          if (ret == -1 && errno == ESRCH) {
            ThreadList::threadIsDead(thread);
          }
          break;

        case ST_SIGNALED:
          // Already signalled but not yet in the handler: drop it if it has
          // died meanwhile, otherwise look again on the next pass.
          if (THREAD_TGKILL(motherpid, thread->tid, 0) == -1 && errno == ESRCH) {
            ThreadList::threadIsDead(thread);
          } else {
            needrescan = 1;
          }
          break;

        case ST_SUSPINPROG:
          numUserThreads++;
          break;

        case ST_SUSPENDED:
          numUserThreads++;
          break;

        case ST_CKPNTHREAD:
          break;

        default:
          JASSERT(false);
      }
    }
    if (needrescan) usleep(10);
  } while (needrescan);
  unlk_threads();

  /* Collect one notification per user thread.  A sem_wait() that returns -1
   * with EINTR has not consumed a post, so it is retried (the same handling
   * ThreadList::init() applies to sem_launch); otherwise an interrupted wait
   * would be miscounted as a thread having finished saving its state.
   */
  for (int i = 0; i < numUserThreads; i++) {
    while (sem_wait(&semNotifyCkptThread) == -1 && errno == EINTR) {
      // interrupted by a signal handler; wait again
    }
  }

  JASSERT(activeThreads != NULL);
  JLOG(DMTCP)("everything suspended") (numUserThreads);
}

/*************************************************************************
 *
 *  Signal handler for user threads.
 *
 *************************************************************************/
/* Checkpoint-signal handler, run by user threads ('signum' is unused).
 *
 * Saves the thread's signal state, TLS state and execution context, tells
 * the checkpoint thread it is suspended, and blocks until the checkpoint
 * thread releases 'threadResumeLock'.  After a restart, execution re-enters
 * just past the sigsetjmp/getcontext call with 'restoreInProgress' set and
 * takes the restore branch instead.
 */
void stopthisthread (int signum)
{
  // If this is checkpoint thread, exit immediately
  if (curThread == ckptThread) return;

  /* Possible state change scenarios:
   * 1. STOPSIGNAL received from ckpt-thread. In this case, the ckpt-thread
   * already changed the state to ST_SIGNALED. No need to check for locks.
   * Proceed normally.
   *
   * 2. STOPSIGNAL received from Superior thread. In this case we change the
   * state to ST_SIGNALED, if currently in ST_RUNNING. If we are holding
   * any locks (callback_holds_any_locks), we return from the signal handler.
   *
   * 3. STOPSIGNAL raised by this thread itself, after releasing all the locks.
   * In this case, we had already changed the state to ST_SIGNALED as a
   * result of step (2), so the ckpt-thread will never send us a signal.
   *
   * 4. STOPSIGNAL received from Superior thread. Ckpt-threads sends a signal
   * before we had a chance to change state from ST_RUNNING ->
   * ST_SIGNALED. This puts the STOPSIGNAL in the queue. The ckpt-thread will
   * later call sigaction(STOPSIGNAL, SIG_IGN) followed by
   * sigaction(STOPSIGNAL, stopthisthread) to discard all pending signals.
   */
  if (Thread_UpdateState(curThread, ST_SIGNALED, ST_RUNNING)) {
    // Initialized so that the test below never reads an indeterminate value
    // should the callback leave its out-parameter untouched.
    int retval = 0;
    callbackHoldsAnyLocks(&retval);
    if (retval) return;
  }

  // make sure we don't get called twice for same thread
  if (Thread_UpdateState(curThread, ST_SUSPINPROG, ST_SIGNALED)) {

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
    // Remember the thread's name so that it can be re-applied after restart.
    JWARNING(prctl(PR_GET_NAME, curThread->procname) != -1) (JASSERT_ERRNO)
      .Text("prctl(PR_GET_NAME, ...) failed");
#endif

    Thread_SaveSigState(curThread); // save sig state (and block sig delivery)
    TLSInfo_SaveTLSState(&curThread->tlsInfo); // save thread local storage state

    /* Set up our restart point, ie, we get jumped to here after a restore */
#ifdef SETJMP
    JASSERT(sigsetjmp(curThread->jmpbuf, 1) >= 0);
#else
    JASSERT(getcontext(&curThread->savctx) == 0);
#endif
    save_sp(&curThread->saved_sp);

    JLOG(DMTCP)("Thread after sigsetjmp/getcontext")
      (curThread->tid) (curThread->virtual_tid)
      (curThread->saved_sp) (__builtin_return_address(0));

    if (!restoreInProgress) {
      /* We are a user thread and all context is saved.
       * Wait for ckpt thread to write ckpt, and resume.
       */

      /* This sets a static variable in dmtcp.  It must be passed
       * from this user thread to ckpt thread before writing ckpt image
       */
      if (dmtcp_ptrace_enabled == NULL) {
        callbackPreSuspendUserThread();
      }

      /* Tell the checkpoint thread that we're all saved away */
      JASSERT(Thread_UpdateState(curThread, ST_SUSPENDED, ST_SUSPINPROG));
      sem_post(&semNotifyCkptThread);

      /* Same callback as above, but when ptrace support is present and
       * enabled it is invoked only after the checkpoint thread has been
       * notified.
       */
      if (dmtcp_ptrace_enabled != NULL && dmtcp_ptrace_enabled()) {
        callbackPreSuspendUserThread();
      }

      /* Then wait for the ckpt thread to write the ckpt file then wake us up */
      JLOG(DMTCP)("User thread suspended") (curThread->tid);

      // We can't use sem_wait here because sem_wait registers a cleanup
      // handler before going into blocking wait. The handler is popped before
      // returning from it. However, on restart, the thread will do a longjump
      // and thus will never come out of the sem_wait, thus the handler is
      // never popped. This causes a problem later on during pthread_exit. The
      // pthread_exit routine executes all registered cleanup handlers.
      // However, the sem_wait cleanup handler is now invalid and thus we get a
      // segfault.
      // The change in sem_wait behavior was first introduce in glibc 2.21.
      JASSERT(_real_pthread_rwlock_rdlock(threadResumeLock) == 0)
        (JASSERT_ERRNO);
      JASSERT(_real_pthread_rwlock_unlock(threadResumeLock) == 0)
        (JASSERT_ERRNO);

      JLOG(DMTCP)("User thread resuming") (curThread->tid);
    } else {
      /* Else restoreInProgress is set;  This stuff executes to do a restart */
      ThreadList::waitForAllRestored(curThread);

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
      if (!Util::strStartsWith(curThread->procname, DMTCP_PRGNAME_PREFIX)) {
        // Add the "DMTCP:" prefix.
        string newName = string(DMTCP_PRGNAME_PREFIX) + curThread->procname;
        strncpy(curThread->procname,
                newName.c_str(),
                sizeof(curThread->procname));
        // Add a NULL at the end to make sure the string terminates in all cases
        curThread->procname[sizeof(curThread->procname) - 1] = '\0';
      }
      JASSERT(prctl(PR_SET_NAME, curThread->procname) != -1 || errno == EINVAL)
        (curThread->procname) (JASSERT_ERRNO)
        .Text ("prctl(PR_SET_NAME, ...) failed");
#endif
      JLOG(DMTCP)("User thread restored") (curThread->tid);
    }

    // Both branches arrive here with the state still ST_SUSPENDED.
    JASSERT(Thread_UpdateState(curThread, ST_RUNNING, ST_SUSPENDED));


    callbackPreResumeUserThread(restoreInProgress);
    JLOG(DMTCP)("User thread returning to user code")
      (curThread->tid) (__builtin_return_address(0));
  }
}

/*****************************************************************************
 *
 *  Wait for all threads to finish restoring their context, then release them
 *  all to continue on their way.
 *
 *****************************************************************************/
/* Restart-time rendezvous between the checkpoint thread and user threads.
 *
 * thread - descriptor of the calling thread.
 *
 * Checkpoint thread: waits for one notification per user thread, restores
 * signal handlers, runs the post-checkpoint (restart) callback, re-raises
 * the process-wide pending signals, then releases every user thread.
 * Any other thread: notifies the checkpoint thread, waits to be released,
 * then restores its signal state.
 * For 'motherofall', optionally spins for a debugger when
 * DMTCP_RESTART_PAUSE (or MTCP_RESTART_PAUSE) is "4".
 */
void ThreadList::waitForAllRestored(Thread *thread)
{
  if (thread == ckptThread) {
    int i;
    for (i = 0; i < numUserThreads; i++) {
      // A sem_wait() interrupted by a signal (EINTR) consumed no post, so
      // retry it rather than miscounting it as a thread having checked in.
      while (sem_wait(&semNotifyCkptThread) == -1 && errno == EINTR) {
        // retry
      }
    }

    // Now that all threads have been created, restore the signal handler. We
    // need to do it before calling callbackPostCheckpoint() because that
    // routine will invoke restart hooks for all plugins. Some of the plugins
    // might perform tasks that could potentially generate a signal. For
    // example, the timer plugin may restore a timer which will fire right away,
    // and not having an appropriate signal handler could kill the process.
    SigInfo::restoreSigHandlers();

    JLOG(DMTCP)("before callbackPostCheckpoint(isRestart=true)");
    callbackPostCheckpoint(true, NULL); //(isRestart,mtcpRestoreArgvStartAddr);
    JLOG(DMTCP)("after callbackPostCheckpoint(isRestart=true)");

    /* raise the signals which were pending for the entire process at the time
     * of checkpoint. It is assumed that if a signal is pending for all threads
     * including the ckpt-thread, then it was sent to the process as opposed to
     * sent to individual threads.
     */
    for (i = SIGRTMAX; i > 0; --i) {
      if (sigismember(&sigpending_global, i) == 1) {
        kill(getpid(), i);
      }
    }

    // if this was last of all, wake everyone up
    for (i = 0; i < numUserThreads; i++) {
      sem_post(&semWaitForCkptThreadSignal);
    }
  } else {
    sem_post(&semNotifyCkptThread);
    // Retry on EINTR so that the thread never proceeds before it has
    // actually been released by the checkpoint thread.
    while (sem_wait(&semWaitForCkptThreadSignal) == -1 && errno == EINTR) {
      // retry
    }
    Thread_RestoreSigState(thread);
  }

  if (thread == motherofall) {
    /* If DMTCP_RESTART_PAUSE==4, wait for gdb attach.*/
    char * pause_param = getenv("DMTCP_RESTART_PAUSE");
    if (pause_param == NULL) {
      pause_param = getenv("MTCP_RESTART_PAUSE");
    }
    if (pause_param != NULL && pause_param[0] == '4' &&
        pause_param[1] == '\0') {
#ifdef HAS_PR_SET_PTRACER
      prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); // For: gdb attach
#endif // ifdef HAS_PR_SET_PTRACER
      // 'volatile' forces the flag to be re-read from memory on every
      // iteration, so the loop is neither optimized away nor turned into an
      // unconditional spin; a debugger ends it by storing 0 into 'dummy'.
      volatile int dummy = 1;
      while (dummy);
#ifdef HAS_PR_SET_PTRACER
      prctl(PR_SET_PTRACER, 0, 0, 0, 0); // Revert permission to default.
#endif // ifdef HAS_PR_SET_PTRACER
    }
  }
}

/*****************************************************************************
 *
 *****************************************************************************/
/* Debug variant of the post-restart entry point (stored in the MTCP header
 * as 'post_restart_debug').
 *
 * restartPause - pause level; when it is 1 the function spins until a
 *                debugger clears 'dummy'.  The value is then exported as the
 *                single digit of DMTCP_RESTART_PAUSE before postRestart()
 *                is called.
 */
void
ThreadList::postRestartDebug(int restartPause)
{ // Don't try to print before debugging.  Who knows what is working yet?
  // 'volatile' forces the flag to be re-read from memory on every iteration,
  // so the wait loop below is neither optimized away nor turned into an
  // unconditional spin; a debugger ends it by storing 0 into 'dummy'.
  volatile int dummy = 1;
#ifndef DEBUG
  // printf may fail, but we'll risk it to let user know this:
  printf("\n** DMTCP: It appears DMTCP not configured with '--enable-debug'\n");
  printf("**        If GDB doesn't show source, re-configure and re-compile\n");
#endif
  if (restartPause == 1) {
    // If we're here, restartPause is 1 (presumably from the user setting
    // DMTCP_RESTART_PAUSE=1 -- confirm against the restart program); the
    // user is expecting this pause.
    while (dummy);
    // User should have done GDB attach if we're here.
#ifdef HAS_PR_SET_PTRACER
    prctl(PR_SET_PTRACER, 0, 0, 0, 0); // Revert to default: no ptracer
#endif
  }
  // Static storage: the buffer outlives this call.  Only a single digit is
  // representable, so restartPause is assumed to be in 0..9.
  static char restartPauseStr[2];
  restartPauseStr[0] = '0' + restartPause;
  restartPauseStr[1] = '\0';
  setenv("DMTCP_RESTART_PAUSE", restartPauseStr, 1);
  postRestart();
}

// threadlist.h sets these as default arguments: restartPause=0
/*
 * Restart entry point: re-establish process-wide state, then recreate every
 * thread recorded on activeThreads and let each one resume from its saved
 * context.
 *
 * Steps, in order:
 *   1. Optionally spin for a debugger (DMTCP_RESTART_PAUSE / MTCP_RESTART_PAUSE
 *      equal to "2").
 *   2. Reset the protected fd base, the coordinator socket and SharedData.
 *   3. Record the new real tid of the calling thread as motherpid.
 *   4. For each thread on activeThreads, fold its pending-signal set into
 *      sigpending_global (the intersection over all threads) and, except for
 *      motherofall, clone a new kernel thread that runs restarthread().
 *   5. Run restarthread() for motherofall on the calling thread.
 */
void
ThreadList::postRestart(void)
{
  Thread *thread;
  sigset_t tmp;

  /* If DMTCP_RESTART_PAUSE==2, wait for gdb attach. */
  char * pause_param = getenv("DMTCP_RESTART_PAUSE");
  if (pause_param == NULL) {
    pause_param = getenv("MTCP_RESTART_PAUSE");
  }
  if (pause_param != NULL && pause_param[0] == '2' && pause_param[1] == '\0') {
#ifdef HAS_PR_SET_PTRACER
    prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); // Allow 'gdb attach'
#endif // ifdef HAS_PR_SET_PTRACER
    // In src/mtcp_restart.c, we printed to user:
    // "Stopping due to env. var DMTCP_RESTART_PAUSE or MTCP_RESTART_PAUSE ..."
    // 'volatile' makes the loop re-read the flag each iteration, so that a
    // debugger clearing 'dummy' actually releases it; a plain int may be
    // cached in a register or the loop folded away by the optimizer.
    volatile int dummy = 1;
    while (dummy);
#ifdef HAS_PR_SET_PTRACER
    prctl(PR_SET_PTRACER, 0, 0, 0, 0); // Revert permission to default.
#endif // ifdef HAS_PR_SET_PTRACER
  }

  /* On restart, if the system has a different limit on open file descriptors,
   * we need to reset the base protected fd and the coordinator socket.
   */
  Util::setProtectedFdBase();
  CoordinatorAPI::instance().resetCoordSocketFd();

  SharedData::postRestart();

  /* Fill in the new mother process id */
  motherpid = THREAD_REAL_TID();
  motherofall->tid = motherpid;

  restoreInProgress = true;

  Util::allowGdbDebug(DEBUG_POST_RESTART);

  /* Start from the full set and AND in each thread's pending set, so that
   * sigpending_global ends up holding only the signals that were pending for
   * every thread (motherofall included: the AND happens before the skip).
   */
  sigfillset(&tmp);
  for (thread = activeThreads; thread != NULL; thread = thread->next) {
    struct MtcpRestartThreadArg mtcpRestartThreadArg;
    sigandset(&sigpending_global, &tmp, &(thread->sigpending));
    tmp = sigpending_global;

    /* motherofall is the calling thread; it is resumed after the loop. */
    if (thread == motherofall) continue;

    /* DMTCP needs to know virtual_tid of the thread being recreated by the
     *  following clone() call.
     *
     * Threads are created by using syscall which is intercepted by DMTCP and
     *  the virtual_tid is sent to DMTCP as a field of MtcpRestartThreadArg
     *  structure. DMTCP will automatically extract the actual argument
     *  (clonearg->arg) from clone_arg and will pass it on to the real
     *  clone call.
     */
    void *clonearg = thread;
    if (dmtcp_real_to_virtual_pid != NULL) {
      mtcpRestartThreadArg.arg = thread;
      mtcpRestartThreadArg.virtualTid = thread->virtual_tid;
      clonearg = &mtcpRestartThreadArg;
    }

    /* Create the thread so it can finish restoring itself. */
    pid_t tid = _real_clone(restarthread,
                            // -128 for red zone
                            (void*)((char*)thread->saved_sp - 128),
                            /* Don't do CLONE_SETTLS (it'll puke).  We do it
                             * later via restoreTLSState. */
                            thread->flags & ~CLONE_SETTLS,
                            clonearg, thread->ptid, NULL, thread->ctid);

    JASSERT (tid > 0); // (JASSERT_ERRNO) .Text("Error recreating thread");
    JLOG(DMTCP)("Thread recreated") (thread->tid) (tid);
  }
  /* Finally resume the original (mother) thread; this call does not return. */
  restarthread (motherofall);
}

/*****************************************************************************
 *
 *  Start routine for a thread being restored.
 *
 *  threadv:  the Thread descriptor of the thread to resume.
 *
 *  Records the thread's new real tid, restores its TLS state, optionally
 *  spins for a debugger (motherofall only, pause level "3"), and then jumps
 *  back into the context saved at checkpoint time.  The function is not
 *  expected to return; the trailing return only silences the compiler.
 *
 *****************************************************************************/
static int restarthread (void *threadv)
{
  Thread *thread = (Thread*) threadv;
  thread->tid = THREAD_REAL_TID();
  // This function and related ones are defined in src/mtcp/restore_libc.c
  TLSInfo_RestoreTLSState(&thread->tlsInfo);

  if (TLSInfo_HaveThreadSysinfoOffset())
    TLSInfo_SetThreadSysinfo(saved_sysinfo);

  if (thread == motherofall) { // only the mother thread honors the pause
    /* If DMTCP_RESTART_PAUSE==3, wait for gdb attach.*/
    char * pause_param = getenv("DMTCP_RESTART_PAUSE");
    if (pause_param == NULL) {
      pause_param = getenv("MTCP_RESTART_PAUSE");
    }
    if (pause_param != NULL && pause_param[0] == '3' &&
        pause_param[1] == '\0') {
#ifdef HAS_PR_SET_PTRACER
      prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); // For: gdb attach
#endif // ifdef HAS_PR_SET_PTRACER
      // In src/mtcp_restart.c, we printed to user:
      // "Stopping due to env. var DMTCP_RESTART_PAUSE or MTCP_RESTART_PAUSE .."
      // 'volatile' keeps the compiler from caching the flag or eliding the
      // loop, so clearing 'dummy' from the debugger really ends the wait.
      volatile int dummy = 1;
      while (dummy);
#ifdef HAS_PR_SET_PTRACER
      prctl(PR_SET_PTRACER, 0, 0, 0, 0); // Revert permission to default.
#endif // ifdef HAS_PR_SET_PTRACER
    }
  }

  /* Jump to the stopthisthread routine just after sigsetjmp/getcontext call.
   * Note that if this is the restored checkpointhread, it jumps to the
   * checkpointhread routine
   */
  JLOG(DMTCP)("calling siglongjmp/setcontext") (thread->tid) (thread->virtual_tid);
#ifdef SETJMP
  siglongjmp(thread->jmpbuf, 1); /* Shouldn't return */
#else
  setcontext(&thread->savctx); /* Shouldn't return */
#endif
  JASSERT(false);
  return (0); /* NOTREACHED : stop compiler warning */
}

/*****************************************************************************
 *
 *  Compare-and-set on a thread's state, performed while holding
 *  threadStateLock.
 *
 *  If th->state currently equals oldval it is replaced by newval and 1 is
 *  returned; otherwise the state is left alone and 0 is returned.
 *
 *****************************************************************************/
int Thread_UpdateState(Thread *th, ThreadState newval, ThreadState oldval)
{
  JASSERT(_real_pthread_mutex_lock(&threadStateLock) == 0);

  // Decide and apply the transition inside the critical section.
  const bool matched = (th->state == oldval);
  if (matched) {
    th->state = newval;
  }

  JASSERT(_real_pthread_mutex_unlock(&threadStateLock) == 0);
  return matched ? 1 : 0;
}

/*****************************************************************************
 *
 *  Save signal mask and list of pending signals delivery
 *
 *  Both values are stored in the descriptor (th->sigblockmask and
 *  th->sigpending) for the calling thread.  Either query failing is treated
 *  as fatal: an unfilled set would otherwise be consulted later when
 *  deciding which signals to re-raise.
 *
 *****************************************************************************/
void Thread_SaveSigState(Thread *th)
{
  // Save signal block mask (a NULL 'set' argument only queries the mask)
  JASSERT(pthread_sigmask (SIG_SETMASK, NULL, &th->sigblockmask) == 0);

  // Save pending signals; checked like the mask query above
  JASSERT(sigpending(&th->sigpending) == 0);
}

/*****************************************************************************
 *
 *  Restore signal mask and all pending signals
 *
 *  The saved block mask (th->sigblockmask) is reinstalled first.  Then every
 *  signal that was pending for this thread, is blocked by that mask, is not
 *  in sigpending_global, and is not the checkpoint signal, is raised again
 *  on the calling thread.
 *
 *****************************************************************************/
void Thread_RestoreSigState (Thread *th)
{
  int i;
  JLOG(DMTCP)("restoring signal mask for thread") (th->virtual_tid);
  JASSERT(pthread_sigmask (SIG_SETMASK, &th->sigblockmask, NULL) == 0);

  // Raise the signals which were pending for only this thread at the time of
  // checkpoint.
  for (i = SIGRTMAX; i > 0; --i) {
    if (sigismember(&th->sigpending, i)  == 1  &&
        sigismember(&th->sigblockmask, i) == 1 &&
        sigismember(&sigpending_global, i) == 0 &&
        i != dmtcp_get_ckpt_signal()) {
      // The warning is specific to SIGCHLD, so emit it only for SIGCHLD
      // (the test used to be inverted and fired for every other signal).
      if (i == SIGCHLD) {
        JNOTE("\n*** WARNING:  SIGCHLD was delivered prior to ckpt.\n"
               "*** Will raise it on restart.  If not desired, change\n"
               "*** this line raising SIGCHLD.");
      }
      raise(i);
    }
  }
}


/*****************************************************************************
 *
 * Make 'th' the current thread and link it at the head of activeThreads.
 *
 * Any other descriptor that carries the same tid is taken to belong to a
 * dead thread and is retired first.  Zombie descriptors whose tid no longer
 * answers a tgkill probe are retired during the same pass.
 *
 *****************************************************************************/
void ThreadList::addToActiveList(Thread *th)
{
  lock_threads();

  // BACKGROUND:  after fork() the call chain is
  //   ... -> initializeMtcpEngine() -> ThreadList::init() -> updateTid()
  //       -> addToActiveList()
  //   and only the thread that called fork() survives.  Earlier in
  //   initializeMtcpEngine(), before init(), this ran:
  //   ... -> ThreadSync::initMotherOfAll() -> ThreadSync::initThread()
  //   Resetting 'curThread = NULL' inside ThreadSync::initThread() would be
  //   the logical place, but curThread is static (file-private), which makes
  //   that inconvenient.  updateTid() has created the fresh descriptor, so
  //   curThread is pointed at it here in case that has not happened yet.
  //   ThreadList::init() also sets curThread to NULL; that is only sound
  //   because fork() leaves just the caller of init() alive, so it is the
  //   less general fix and the assignment is repeated here as well.
  curThread = th;

  const int tid = curThread->tid;
  JASSERT (tid != 0);

  // Pass 1: retire stale descriptors.  The successor is captured up front
  // because threadIsDead() overwrites thread->next.
  Thread *thread = activeThreads;
  while (thread != NULL) {
    Thread *const successor = thread->next;

    if (thread != curThread && thread->tid == tid) {
      JLOG(DMTCP)("Removing duplicate thread descriptor")
        (thread->tid) (thread->virtual_tid);
      // There will be at most one duplicate descriptor.
      threadIsDead(thread);
    }
    // FIXME:  This causes segfault on second restart.  Why?
    // JASSERT(thread != curThread)(thread)
    //   .Text("adding curThread, but it's already on activeThreads");
    /* NOTE:  ST_ZOMBIE exists purely for efficiency: zombie descriptors are
     *   probed with tgkill so they can be dropped early (before the next
     *   checkpoint) and the descriptor list does not grow too long.
     *   A probe result of -1 means no thread with this tid remains.
     */
    else if (thread->state == ST_ZOMBIE &&
             -1 == THREAD_TGKILL(motherpid, thread->tid, 0)) {
      JLOG(DMTCP)("Killing zombie thread") (thread->tid);
      threadIsDead(thread);
    }

    thread = successor;
  }

  // Pass 2: push curThread onto the front of the active list.
  Thread *const oldHead = activeThreads;
  curThread->prev = NULL;
  curThread->next = oldHead;
  if (oldHead != NULL) {
    oldHead->prev = curThread;
  }
  activeThreads = curThread;

  unlk_threads();
}

/*****************************************************************************
 *
 *  A thread is gone: unlink its descriptor from activeThreads and park it
 *  on threads_freelist.
 *
 *  The descriptor is deliberately not free()d here.  An earlier version did
 *  so, but calling free() in the middle of a checkpoint can deadlock inside
 *  JAllocator.  Descriptors therefore wait on the freelist until free() is
 *  safe to call.
 *
 *  A side benefit is fewer malloc() calls, since descriptors on the freelist
 *  can be handed out again.
 *
 *****************************************************************************/
void ThreadList::threadIsDead (Thread *thread)
{
  JASSERT(thread != NULL);
  JLOG(DMTCP)("Putting thread on freelist") (thread->tid);

  Thread *const before = thread->prev;
  Thread *const after = thread->next;

  /* Splice the descriptor out of the doubly-linked active list. */
  if (activeThreads == thread) {
    activeThreads = after;
  }
  if (before != NULL) {
    before->next = after;
  }
  if (after != NULL) {
    after->prev = before;
  }

  /* Push it onto the (singly-linked) freelist. */
  thread->next = threads_freelist;
  threads_freelist = thread;
}

/*****************************************************************************
 *
 * Hand out a zero-filled Thread descriptor: reuse the head of
 * threads_freelist when one is available, otherwise allocate a new one.
 *
 *****************************************************************************/
Thread *ThreadList::getNewThread()
{
  lock_threads();

  Thread *thread = threads_freelist;
  if (thread != NULL) {
    // Recycle: pop the head of the freelist.
    threads_freelist = thread->next;
  } else {
    // Nothing to recycle: allocate a fresh descriptor.
    thread = (Thread*) JALLOC_HELPER_MALLOC(sizeof(Thread));
    JASSERT(thread != NULL);
  }

  unlk_threads();

  // Clear the descriptor in both cases, after the lock has been released.
  memset(thread, 0, sizeof (*thread));
  return thread;
}

/*****************************************************************************
 *
 * Call free() on all threads_freelist items
 *
 *****************************************************************************/
void ThreadList::emptyFreeList()
{
  lock_threads();

  while (threads_freelist != NULL) {
    Thread *thread = threads_freelist;
    threads_freelist = threads_freelist->next;
    JALLOC_HELPER_FREE(thread);
  }

  unlk_threads();
}