File: control.c

package info (click to toggle)
systemtap 5.1-5
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 47,964 kB
  • sloc: cpp: 80,838; ansic: 54,757; xml: 49,725; exp: 43,665; sh: 11,527; python: 5,003; perl: 2,252; tcl: 1,312; makefile: 1,006; javascript: 149; lisp: 105; awk: 101; asm: 91; java: 70; sed: 16
file content (885 lines) | stat: -rw-r--r-- 27,529 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
/* -*- linux-c -*-
 *
 * control channel
 * Copyright (C) 2007-2018 Red Hat Inc.
 *
 * This file is part of systemtap, and is free software.  You can
 * redistribute it and/or modify it under the terms of the GNU General
 * Public License (GPL); either version 2, or (at your option) any
 * later version.
 */

#include "control.h"
#include "../mempool.c"
#include "symbols.c"
#include <linux/delay.h>
#include <linux/poll.h>
#include "../uidgid_compatibility.h"

/* Pool of free _stp_buffer message buffers shared by all control-channel
   senders (sized in _stp_register_ctl_channel).  */
static _stp_mempool_t *_stp_pool_q;
/* Filled buffers queued here are drained by _stp_ctl_read_cmd().  */
static struct list_head _stp_ctl_ready_q;
/* Guards _stp_ctl_ready_q.  Taken with the stp_nmi_* trylock variants on
   the send paths, which can run in arbitrary probe context.  */
static STP_DEFINE_SPINLOCK(_stp_ctl_ready_lock);
/* Guards the 'type' field of the preallocated "special" message buffers
   (see _STP_CTL_MSG_UNUSED below).  */
static STP_DEFINE_SPINLOCK(_stp_ctl_special_msg_lock);

/* Handlers for individual STP_* control messages received in
   _stp_ctl_write_cmd(); declared here, defined later in this file or in
   the runtime sources included into this translation unit.  */
static void _stp_cleanup_and_exit(int send_exit);
static void _stp_handle_tzinfo (struct _stp_msg_tzinfo* tzi);
static void _stp_handle_privilege_credentials (struct _stp_msg_privilege_credentials* pc);
static void _stp_handle_remote_id (struct _stp_msg_remote_id* rem);
static void _stp_handle_namespaces_pid (struct _stp_msg_ns_pid *nspid);
static void _stp_handle_mnt_ns_fds (struct _stp_msg_mnt_ns_fds *nsfds);

/* write() handler for the .cmd control file.  Parses a u32 message type
   tag followed by a type-specific payload and dispatches to the matching
   handler.  Returns the number of bytes consumed or a negative errno.  */
static ssize_t _stp_ctl_write_cmd(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
        /* Serializes all command handling (PR17232, see below); this is
           also what makes the per-case static message structs safe.  */
        static DEFINE_MUTEX(cmd_mutex);
	u32 type;
        int rc = 0;

	/* Effective uid of the writer, across kernel API generations.  */
#ifdef STAPCONF_TASK_UID
	uid_t euid = current->euid;
#else
#if defined(CONFIG_USER_NS) || (LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0))
	uid_t euid = from_kuid_munged(current_user_ns(), current_euid());
#else
	uid_t euid = current_euid();
#endif
#endif

	/* Remember the pid of the userspace peer driving us.  */
	_stp_pid = current->pid;

	/* Every message starts with a u32 type tag; a too-short write is
	   reported as "0 bytes consumed".  */
	if (count < sizeof(u32))
		return 0;

	if (get_user(type, (u32 __user *)buf))
		return -EFAULT;

	/* Step past the tag; buf/count now describe only the payload.  */
	count -= sizeof(u32);
	buf += sizeof(u32);

#if defined(DEBUG_TRANS) && (DEBUG_TRANS >= 2)
	if (type < STP_MAX_CMD)
		dbug_trans2("Got %s. euid=%ld, len=%d\n", _stp_command_name[min(type, (u32)STP_MAX_CMD)] ?: "?",
			    (long)euid, (int)count);
#endif

        // PR17232: preclude reentrancy during handling of messages.
        // This also permits use of static variables in the switch/case.
	might_sleep();
        mutex_lock (& cmd_mutex);
        // NB: past this point, no 'return;' - use 'goto out;'

	switch (type) {
	case STP_START:
        {
                static struct _stp_msg_start st;
                /* Short payload: silently ignored (0 bytes consumed).  */
                if (count < sizeof(st)) {
                        rc = 0; // ?
                        goto out;
                }
                if (copy_from_user(&st, buf, sizeof(st))) {
                        rc = -EFAULT;
                        goto out;
                }
                _stp_handle_start(&st);
        }
        break;

	case STP_EXIT:
                _stp_cleanup_and_exit(1);
		break;

	case STP_BULK:
                // NB: this signals the runtime to save separate
                // per-cpu files; our kernel->userspace files are now
                // always bulkmode (trace$N files).
#ifdef STP_BULKMODE
                // no action needed
                break;
#else
		rc = -EINVAL;
                goto out;
#endif

	case STP_RELOCATION:
		/* Root-only: relocation data affects how kernel addresses
		   are interpreted.  */
		if (euid != 0) {
                        rc = -EPERM;
                        goto out;
                }
                /* This message is too large to copy here.
                   Further error checking is within the
                   function, but XXX no rc is passed back. */
          	_stp_do_relocation (buf, count);
          	break;

        case STP_TZINFO:
        {
                /* NB PR13445: We use a static struct here to contain
                   the kernel-side copy of the user-space message.
                   This should be suspicious (due to concurrency
                   concerns), but actually it's OK.  The main reason
                   is that _stp_ctl_open_cmd() enforces only a single
                   open() at a time on the .ctl file, and staprun
                   (euid=0) isn't multithreaded, and doesn't pass this
                   filehandle anywhere. */
                static struct _stp_msg_tzinfo tzi;
		if (euid != 0) {
                        rc = -EPERM;
                        goto out;
                }
                if (count < sizeof(tzi)) {
                        rc = 0;
                        goto out;
                }
                if (copy_from_user(&tzi, buf, sizeof(tzi))) {
                        rc = -EFAULT;
                        goto out;
                }
                _stp_handle_tzinfo(&tzi);
        }
        break;

        case STP_PRIVILEGE_CREDENTIALS:
        {
                /* NB PR13445: as above. */
                static struct _stp_msg_privilege_credentials pc;
		if (euid != 0) {
                        rc = -EPERM;
                        goto out;
                }
                if (count < sizeof(pc)) {
                        rc = 0;
                        goto out;
                }
                if (copy_from_user(&pc, buf, sizeof(pc))) {
                        rc = -EFAULT;
                        goto out;
                }
                _stp_handle_privilege_credentials(&pc);
        }
        break;

        case STP_REMOTE_ID:
        {
                /* NB PR13445: as above. */
                static struct _stp_msg_remote_id rem;
		if (euid != 0) {
                        rc = -EPERM;
                        goto out;
                }
                if (count < sizeof(rem)) {
                        rc = 0;
                        goto out;
                }
                if (copy_from_user(&rem, buf, sizeof(rem))) {
                        rc = -EFAULT;
                        goto out;
                }
                _stp_handle_remote_id(&rem);
        }
        break;

	case STP_READY:
		/* Handshake only; no payload to process.  */
		break;
  case STP_NAMESPACES_PID:
    {
    /* NOTE(review): unlike TZINFO/RELOCATION above there is no euid
       check here — presumably intentional, but worth confirming.  */
    static struct _stp_msg_ns_pid nspid;
                if (count < sizeof(nspid)) {
                        rc = 0;
                        goto out;
                }
                if (copy_from_user(&nspid, buf, sizeof(nspid))) {
                        rc = -EFAULT;
                        goto out;
                }

    _stp_handle_namespaces_pid(&nspid);
    }
    break;

        case STP_MNT_NS_FDS:
        {
		static struct _stp_msg_mnt_ns_fds nsfds;
		if (count < sizeof(nsfds)) {
			rc = 0;
			goto out;
		}
		if (copy_from_user(&nsfds, buf, sizeof(nsfds))) {
			rc = -EFAULT;
			goto out;
		}

		_stp_handle_mnt_ns_fds(&nsfds);
        }
        break;

	default:
#ifdef DEBUG_TRANS
		dbug_trans2("invalid command type %d\n", type);
#endif
		rc = -EINVAL;
                goto out;
	}

        // fall through
	rc = count + sizeof(u32); /* Pretend that we absorbed the entire message. */

out:
        mutex_unlock (& cmd_mutex);

#if defined(DEBUG_TRANS) && (DEBUG_TRANS >= 2)
	if (type < STP_MAX_CMD)
		dbug_trans2("Completed %s (rc=%d)\n",
                            _stp_command_name[min(type, (u32)STP_MAX_CMD)] ?: "?",
                            rc);
#endif
        return rc;
}

/* Readers of the .cmd file sleep here until a message is queued; woken
   by _stp_ctl_send_notify() (a periodic timer also picks up messages
   queued from probe context — see the comment above _stp_ctl_send).  */
static DECLARE_WAIT_QUEUE_HEAD(_stp_ctl_wq);

#ifdef DEBUG_TRANS
/* Trace a control message about to be sent (DEBUG_TRANS builds only).
   For payload-carrying types a bounded prefix of the data is shown.  */
static void _stp_ctl_write_dbug(int type, void *data, int len)
{
	/* Scratch copy so only a bounded prefix of the payload is printed.
	   NOTE(review): assumes 'data' is NUL-terminated for these types —
	   appears true for the runtime's warn/err strings; verify.  */
	char tmp[64];

	if (type == STP_OOB_DATA || type == STP_SYSTEM) {
		snprintf(tmp, sizeof(tmp), "%s", (char *)data);
		if (type == STP_OOB_DATA)
			dbug_trans2("sending %d bytes of STP_OOB_DATA: %s\n", len,
				    tmp);
		else
			dbug_trans2("sending STP_SYSTEM: %s\n", tmp);
		return;
	}

	switch (type) {
	case STP_START:
		dbug_trans2("sending STP_START\n");
		break;
	case STP_EXIT:
		dbug_trans2("sending STP_EXIT\n");
		break;
	case STP_TRANSPORT:
		dbug_trans2("sending STP_TRANSPORT\n");
		break;
	case STP_CONNECT:
		dbug_trans2("sending STP_CONNECT\n");
		break;
	case STP_DISCONNECT:
		dbug_trans2("sending STP_DISCONNECT\n");
		break;
	case STP_BULK:
		dbug_trans2("sending STP_BULK\n");
		break;
	case STP_READY:
	case STP_RELOCATION:
	case STP_BUF_INFO:
	case STP_SUBBUFS_CONSUMED:
		dbug_trans2("sending old message\n");
		break;
	case STP_REALTIME_DATA:
		dbug_trans2("sending %d bytes of STP_REALTIME_DATA\n", len);
		break;
	case STP_REQUEST_EXIT:
		dbug_trans2("sending STP_REQUEST_EXIT\n");
		break;
	default:
		dbug_trans2("ERROR: unknown message type: %d\n", type);
		break;
	}
}
#endif

/* Marker to show a "special" message buffer isn't being used.
   Will be put in the _stp_buffer type field.  The type field Should
   only be manipulated while holding the _stp_ctl_special_msg_lock.  */
#define _STP_CTL_MSG_UNUSED STP_MAX_CMD

/* cmd messages allocated ahead of time.  There can be only one.
   Each is claimed in _stp_ctl_get_buffer() and released (marked
   _STP_CTL_MSG_UNUSED) in _stp_ctl_free_buffer().  */
static struct _stp_buffer *_stp_ctl_start_msg;
static struct _stp_buffer *_stp_ctl_exit_msg;
static struct _stp_buffer *_stp_ctl_transport_msg;
static struct _stp_buffer *_stp_ctl_request_exit_msg;

/* generic overflow messages allocated ahread of time.  These carry
   canned "too many pending" texts and stand in when the dynamic pool
   is exhausted.  */
static struct _stp_buffer *_stp_ctl_oob_warn;
static struct _stp_buffer *_stp_ctl_oob_err;
static struct _stp_buffer *_stp_ctl_system_warn;
static struct _stp_buffer *_stp_ctl_realtime_err;

/* Set aside buffers for all "special" message types, plus generic
   warning and error messages.  */
static int _stp_ctl_alloc_special_buffers(void)
{
	size_t len;
	const char *msg;

	/* There can be only one of start, exit, transport and request.  */
	_stp_ctl_start_msg = _stp_mempool_alloc(_stp_pool_q);
	if (_stp_ctl_start_msg == NULL)
		return -1;
	_stp_ctl_start_msg->type = _STP_CTL_MSG_UNUSED;

	_stp_ctl_exit_msg = _stp_mempool_alloc(_stp_pool_q);
	if (_stp_ctl_exit_msg == NULL)
		return -1;
	_stp_ctl_exit_msg->type = _STP_CTL_MSG_UNUSED;

	_stp_ctl_transport_msg = _stp_mempool_alloc(_stp_pool_q);
	if (_stp_ctl_transport_msg == NULL)
		return -1;
	_stp_ctl_transport_msg->type = _STP_CTL_MSG_UNUSED;

	_stp_ctl_request_exit_msg = _stp_mempool_alloc(_stp_pool_q);
	if (_stp_ctl_request_exit_msg == NULL)
		return -1;
	_stp_ctl_request_exit_msg->type = _STP_CTL_MSG_UNUSED;

	/* oob_warn, oob_err, system and realtime are dynamically
	   allocated and a special static warn/err message take their
	   place if we run out of memory before delivery.  */
	_stp_ctl_oob_warn = _stp_mempool_alloc(_stp_pool_q);
	if (_stp_ctl_oob_warn == NULL)
		return -1;
	_stp_ctl_oob_warn->type = _STP_CTL_MSG_UNUSED;
	/* Note that the following message shouldn't be translated,
	 * since "WARNING:" is part of the module cmd protocol. */
	msg = "WARNING: too many pending (warning) messages\n";
	len = strlen(msg) + 1;
	_stp_ctl_oob_warn->len = len;
	memcpy(&_stp_ctl_oob_warn->buf, msg, len);

	_stp_ctl_oob_err = _stp_mempool_alloc(_stp_pool_q);
	if (_stp_ctl_oob_err == NULL)
		return -1;
	_stp_ctl_oob_err->type = _STP_CTL_MSG_UNUSED;
	/* Note that the following message shouldn't be translated,
	 * since "ERROR:" is part of the module cmd protocol. */
	msg = "ERROR: too many pending (error) messages\n";
	len = strlen(msg) + 1;
	_stp_ctl_oob_err->len = len;
	memcpy(&_stp_ctl_oob_err->buf, msg, len);

	_stp_ctl_system_warn = _stp_mempool_alloc(_stp_pool_q);
	if (_stp_ctl_system_warn == NULL)
		return -1;
	_stp_ctl_system_warn->type = _STP_CTL_MSG_UNUSED;
	/* Note that the following message shouldn't be translated,
	 * since "WARNING:" is part of the module cmd protocol. */
	msg = "WARNING: too many pending (system) messages\n";
	len = strlen(msg) + 1;
	_stp_ctl_system_warn->len = len;
	memcpy(&_stp_ctl_system_warn->buf, msg, len);

	_stp_ctl_realtime_err = _stp_mempool_alloc(_stp_pool_q);
	if (_stp_ctl_realtime_err == NULL)
		return -1;
	_stp_ctl_realtime_err->type = _STP_CTL_MSG_UNUSED;
	/* Note that the following message shouldn't be translated,
	 * since "ERROR:" is part of the module cmd protocol. */
	msg = "ERROR: too many pending (realtime) messages\n";
	len = strlen(msg) + 1;
	_stp_ctl_realtime_err->len = len;
	memcpy(&_stp_ctl_realtime_err->buf, msg, len);

	return 0;
}


/* Free the buffers for all "special" message types, plus generic
   warning and error messages.  */
static void _stp_ctl_free_special_buffers(void)
{
	if (_stp_ctl_start_msg != NULL) {
		_stp_mempool_free(_stp_ctl_start_msg);
		_stp_ctl_start_msg = NULL;
	}

	if (_stp_ctl_exit_msg != NULL) {
		_stp_mempool_free(_stp_ctl_exit_msg);
		_stp_ctl_exit_msg = NULL;
	}

	if (_stp_ctl_transport_msg != NULL) {
		_stp_mempool_free(_stp_ctl_transport_msg);
		_stp_ctl_transport_msg = NULL;
	}

	if (_stp_ctl_request_exit_msg != NULL) {
		_stp_mempool_free(_stp_ctl_request_exit_msg);
		_stp_ctl_request_exit_msg = NULL;
	}

	if (_stp_ctl_oob_warn != NULL) {
		_stp_mempool_free(_stp_ctl_oob_warn);
		_stp_ctl_oob_warn = NULL;
	}

	if (_stp_ctl_oob_err != NULL) {
		_stp_mempool_free(_stp_ctl_oob_err);
		_stp_ctl_oob_err = NULL;
	}

	if (_stp_ctl_system_warn != NULL) {
		_stp_mempool_free(_stp_ctl_system_warn);
		_stp_ctl_system_warn = NULL;
	}

	if (_stp_ctl_realtime_err != NULL) {
		_stp_mempool_free(_stp_ctl_realtime_err);
		_stp_ctl_realtime_err = NULL;
	}
}


/* Get a buffer based on type, possibly a generic buffer, when all else
   fails returns NULL and there is nothing we can do.  */
static struct _stp_buffer *_stp_ctl_get_buffer(int type, const char *data,
					       unsigned len)
{
	unsigned long flags = 0;
	struct _stp_buffer *bptr = NULL;

	/* Is it a dynamically allocated message type? */
	if (type == STP_OOB_DATA
	    || type == STP_SYSTEM
	    || type == STP_REALTIME_DATA)
		bptr = _stp_mempool_alloc(_stp_pool_q);

	if (bptr != NULL) {
		/* Dynamic buffer obtained: fill it in directly.  */
		bptr->type = type;
		memcpy(bptr->buf, data, len);
		bptr->len = len;
	} else {
		/* "special" type, or no more dynamic buffers.
		   We must be careful to lock to avoid races between
		   marking as used/free.  There can be only one.  */
		switch (type) {
		case STP_START:
			bptr = _stp_ctl_start_msg;
			break;
		case STP_EXIT:
			bptr = _stp_ctl_exit_msg;
			break;
    case STP_NAMESPACES_PID:
      /* No fallback buffer is reserved for this type; bptr stays
         NULL and the message is dropped.  */
      break;
		case STP_TRANSPORT:
			bptr = _stp_ctl_transport_msg;
			break;
		case STP_REQUEST_EXIT:
			bptr = _stp_ctl_request_exit_msg;
			break;
		case STP_OOB_DATA:
			/* Pool was exhausted; substitute a canned overflow
			   message matching the original severity.  */
			/* Note that "WARNING:" should not be
			 * translated, since it is part of the module
			 * cmd protocol. */
			if (data && len >= 9
			    && strncmp(data, "WARNING: ", 9) == 0)
				bptr = _stp_ctl_oob_warn;
			/* Note that "ERROR:" should not be
			 * translated, since it is part of the module
			 * cmd protocol. */
			else if (data && len >= 7
				 && strncmp(data, "ERROR: ", 7) == 0)
				bptr = _stp_ctl_oob_err;
			else
				printk(KERN_WARNING "_stp_ctl_get_buffer unexpected STP_OOB_DATA\n");
			break;
		case STP_SYSTEM:
			bptr = _stp_ctl_system_warn;
			type = STP_OOB_DATA; /* overflow message */
			break;
		case STP_REALTIME_DATA:
			bptr = _stp_ctl_realtime_err;
			type = STP_OOB_DATA; /* overflow message */
			break;
		default:
			printk(KERN_WARNING "_stp_ctl_get_buffer unknown type: %d\n", type);
			bptr = NULL;
			break;
		}
		if (bptr != NULL) {
			/* OK, it is a special one, but is it free?
			   NB: the nmi-safe lock macro bails out to the
			   'failed' label below when the lock cannot be
			   acquired safely.  */
			stp_nmi_spin_lock_irqsave(&_stp_ctl_special_msg_lock, flags, failed);
			if (bptr->type == _STP_CTL_MSG_UNUSED)
				bptr->type = type;
			else
				bptr = NULL;	/* already in flight */
			stp_nmi_spin_unlock_irqrestore(&_stp_ctl_special_msg_lock, flags);
		}

		/* Got a special message buffer, with type set, fill it in,
		   unless it is an "overflow" message.  */
		if (bptr != NULL
		    && bptr != _stp_ctl_oob_warn
		    && bptr != _stp_ctl_oob_err
		    && bptr != _stp_ctl_system_warn
		    && bptr != _stp_ctl_realtime_err) {
			memcpy(bptr->buf, data, len);
			bptr->len = len;
		}
	}
	return bptr;

failed:
	/* Could not take the special-message lock (e.g. contention seen
	   from NMI context); treat as "no buffer available".  */
	return NULL;
}

/* Returns the given buffer to the pool when dynamically allocated.
   Marks special buffers as being unused.  */
static void _stp_ctl_free_buffer(struct _stp_buffer *bptr)
{
	unsigned long flags;

	/* Special buffers need special care and locking.  */
	if (bptr == _stp_ctl_start_msg
	    || bptr == _stp_ctl_exit_msg
	    || bptr == _stp_ctl_transport_msg
	    || bptr == _stp_ctl_request_exit_msg
	    || bptr == _stp_ctl_oob_warn
	    || bptr == _stp_ctl_oob_err
	    || bptr == _stp_ctl_system_warn
	    || bptr == _stp_ctl_realtime_err) {
		stp_spin_lock_irqsave(&_stp_ctl_special_msg_lock, flags);
		bptr->type = _STP_CTL_MSG_UNUSED;
		stp_spin_unlock_irqrestore(&_stp_ctl_special_msg_lock, flags);
	} else {
		_stp_mempool_free(bptr);
	}
}

/* Put a message on the _stp_ctl_ready_q.  Safe to call from a probe context.
   Doesn't call wake_up on _stp_ctl_wq (which would not be safe from all
   probe context). A timer will come by and pick up the message to notify
   any readers. Returns the number of bytes queued/send or zero/negative
   on error. */
static int _stp_ctl_send(int type, void *data, unsigned len)
{
	struct context* __restrict__ c = NULL;
	struct _stp_buffer *bptr;
	unsigned long flags = 0;
	unsigned hlen;

#ifdef DEBUG_TRANS
	_stp_ctl_write_dbug(type, data, len);
#endif

	/* Give the fs a chance to do something special.
	   Like merging two packets in case the previous buffer
	   still has some room (transport version 1 procfs does  this. */
	hlen = _stp_ctl_write_fs(type, data, len);
	if (hlen > 0)
		return hlen;	/* fs layer consumed the message itself */

	/* make sure we won't overflow the buffer */
	if (unlikely(len > STP_CTL_BUFFER_SIZE)) {
                _stp_warn("runtime control message type=%d len=%d too large\n", type, len);
		return 0;
	}

	/* Prevent probe reentrancy while grabbing probe-used locks.
	   Since _stp_ctl_send may be called from arbitrary probe context, we
	   have to make sure that all locks it wants can't possibly be held
	   outside probe context too.  This includes:
	    * _stp_ctl_ready_lock
	    * _stp_pool_q->lock
	    * _stp_ctl_special_msg_lock
	   We ensure this by grabbing the context here and everywhere else that
	   uses those locks, so such a probe will appear reentrant and be
	   skipped rather than deadlock.  */
	c = _stp_runtime_entryfn_get_context();

	/* NB: the nmi-safe lock macro jumps to 'no_lock' below when the
	   lock cannot be taken safely.  */
	stp_nmi_spin_lock_irqsave(&_stp_ctl_ready_lock, flags, no_lock);

	/* get a buffer from the free pool */
	bptr = _stp_ctl_get_buffer(type, data, len);
	if (unlikely(bptr == NULL)) {
		/* Nothing else we can do... but let's not spam the kernel
                   with these reports. */
                /* printk(KERN_ERR "ctl_write_msg type=%d len=%d ENOMEM\n", type, len); */
		goto no_mem;
	}

	/* Put it on the pool of ready buffers.  It's possible to recursively
	   hit a probe here, like a kprobe in NMI or the lock tracepoints, but
	   they will be squashed since we're holding the context busy.  */
	list_add_tail(&bptr->list, &_stp_ctl_ready_q);

	stp_nmi_spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);

	_stp_runtime_entryfn_put_context(c);

	/* It would be nice if we could speed up the notification
	   timer at this point, but calling mod_timer() at this
	   point would bring in more locking issues... */
	return len + sizeof(bptr->type);	/* bytes queued incl. type tag */

no_lock:
	_stp_runtime_entryfn_put_context(c);
	return -EBUSY;

no_mem:
	stp_nmi_spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);
	_stp_runtime_entryfn_put_context(c);
	return -ENOMEM;
}

/* Logs a warning or error through the control channel. This function mimics
   _stp_ctl_send() but directly uses an _stp_buffer to construct the warning or
   error message. This is *only* for warnings and errors. The logtype string
   should be either "WARNING: " or "ERROR: ", and logtype_len shouldn't include
   a trailing NUL termination byte. The message type is always assumed to be
   STP_OOB_DATA since this is only for warnings and errors. */
static void _stp_ctl_log_werr(const char *logtype, size_t logtype_len,
			      const char *fmt, va_list args)
{
	struct context *__restrict__ c;
	struct _stp_buffer *bptr;
	unsigned long flags = 0;

	/* Same reentrancy/locking protocol as _stp_ctl_send: hold the
	   context so probe-context locks can't deadlock, and use the
	   nmi-safe lock which bails to 'put_context' on contention.  */
	c = _stp_runtime_entryfn_get_context();

	stp_nmi_spin_lock_irqsave(&_stp_ctl_ready_lock, flags, put_context);

	/* Seeds the buffer with the "WARNING: "/"ERROR: " prefix.  */
	bptr = _stp_ctl_get_buffer(STP_OOB_DATA, logtype, logtype_len);
	if (!bptr)
		goto unlock;

	/*
	 * This is a generic failure message for when there's no space left. We
	 * aren't allowed to change it, so just go straight to sending it off.
	 */
	if (bptr == _stp_ctl_oob_warn || bptr == _stp_ctl_oob_err)
		goto send_msg;

	/*
	 * The logtype string was already copied in by _stp_ctl_get_buffer(),
	 * now copy the rest of the message. The trailing NUL termination byte
	 * automatically added by vscnprintf() is unneeded, so it's ignored.
	 */
	bptr->len += vscnprintf(bptr->buf + logtype_len,
				STP_CTL_BUFFER_SIZE - logtype_len, fmt, args);

	/*
	 * Make sure the last character is a newline. There will always be
	 * enough space to do this because vscnprintf() reserves a byte for the
	 * trailing NUL character which we don't care about.
	 */
	if (bptr->buf[bptr->len - 1] != '\n')
		bptr->buf[bptr->len++] = '\n';

send_msg:
	list_add_tail(&bptr->list, &_stp_ctl_ready_q);

unlock:
	stp_nmi_spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);

put_context:
	_stp_runtime_entryfn_put_context(c);
}

/* Calls _stp_ctl_send and then calls wake_up on _stp_ctl_wq
   to immediately notify listeners. DO NOT CALL THIS FROM A (KERNEL)
   PROBE CONTEXT. This is only safe to call from the transport layer
   itself when in user context. All code that could be triggered from
   a probe context should call _stp_ctl_send(). */
static int _stp_ctl_send_notify(int type, void *data, unsigned len)
{
	int rc;

	dbug_trans(1, "_stp_ctl_send_notify: type=%d len=%d\n", type, len);

	rc = _stp_ctl_send(type, data, len);
	if (rc <= 0)
		return rc;	/* nothing queued; nobody to wake */

	/* A message was queued: wake all _stp_ctl_wq listeners so stapio
	   can pick it up asap.  */
	wake_up_interruptible(&_stp_ctl_wq);
	return rc;
}

/** Called when someone tries to read from our .cmd file.
    Will take _stp_ctl_ready_lock and pick off the next _stp_buffer
    from the _stp_ctl_ready_q, will wait_event on _stp_ctl_wq.  */
static ssize_t _stp_ctl_read_cmd(struct file *file, char __user *buf,
				 size_t count, loff_t *ppos)
{
	struct context* __restrict__ c = NULL;
	struct _stp_buffer *bptr;
	int len;
	unsigned long flags;

	/* Prevent probe reentrancy while grabbing probe-used locks.  */
	c = _stp_runtime_entryfn_get_context();

	/* wait for nonempty ready queue */
	stp_spin_lock_irqsave(&_stp_ctl_ready_lock, flags);
	while (list_empty(&_stp_ctl_ready_q)) {
		stp_spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);
		_stp_runtime_entryfn_put_context(c);
		if (file->f_flags & O_NONBLOCK)
			return -EAGAIN;
		if (wait_event_interruptible(_stp_ctl_wq, !list_empty(&_stp_ctl_ready_q)))
			return -ERESTARTSYS;
		c = _stp_runtime_entryfn_get_context();
		stp_spin_lock_irqsave(&_stp_ctl_ready_lock, flags);
	}

	/* get the next buffer off the ready list */
	bptr = (struct _stp_buffer *)_stp_ctl_ready_q.next;
	list_del_init(&bptr->list);
	stp_spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);

	/* NB: we can't hold the context across copy_to_user, as it might fault.  */
	_stp_runtime_entryfn_put_context(c);

	/* write it out: the wire format is the type tag immediately
	   followed by the payload, copied straight from the buffer
	   (matches len + sizeof(bptr->type) returned by _stp_ctl_send). */
	len = bptr->len + sizeof(bptr->type);
	if (len > count || copy_to_user(buf, &bptr->type, len)) {
		/* Now what?  We took it off the queue then failed to
		 * send it.  We can't put it back on the queue because
		 * it will likely be out-of-order.  Fortunately, this
		 * should never happen.
		 *
		 * Drop the message, but return the buffer so it is not
		 * leaked (a "special" buffer would otherwise stay marked
		 * in-use forever and a pool buffer would never return to
		 * the free pool).
		 *
		 * FIXME: need to mark this as a transport failure. */
		errk("Supplied buffer too small. count:%d len:%d\n", (int)count, len);
		c = _stp_runtime_entryfn_get_context();
		_stp_ctl_free_buffer(bptr);
		_stp_runtime_entryfn_put_context(c);
		return -EFAULT;
	}

	/* put it on the pool of free buffers */
	c = _stp_runtime_entryfn_get_context();
	_stp_ctl_free_buffer(bptr);
	_stp_runtime_entryfn_put_context(c);

	return len;
}

/* open() handler for the .cmd control file.  Enforces a single opener
   at a time via _stp_ctl_attached and patches module ownership into the
   file's f_op (see the long comment below).  Returns 0 or -EBUSY.  */
static int _stp_ctl_open_cmd(struct inode *inode, struct file *file)
{
	static struct file_operations _stp_ctl_fops;

	/* Atomically claim the single-opener slot; back out on a race.  */
	if (atomic_inc_return (&_stp_ctl_attached) > 1) {
                atomic_dec (&_stp_ctl_attached);
		return -EBUSY;
        }

	/*
	 * Replace the file's f_op with our own which has the module owner set.
	 * This is needed because, in do_select(), the only thing that can stop
	 * this module from disappearing while data from our procfs file is in
	 * use is the module reference counter. So we need to set the module
	 * owner pointer and then add a reference to our module, since the
	 * reference addition from the open() has already been skipped by the
	 * time this code is reached. The data which can be used after the
	 * module is freed is `&_stp_ctl_wq`, which is stored and later
	 * dereferenced in do_select(). This pointer is passed to do_select()
	 * from the poll_wait() in _stp_ctl_poll_cmd(), which stores it in
	 * `entry->wait_address`. The reason this use-after-free problem exists
	 * is because procfs doesn't allow for passing in a module owner: all
	 * procfs files use an internal `struct file_operations` located in
	 * fs/proc/inode.c. So we patch in a module owner the hard way. No
	 * locking is needed here due to the `_stp_ctl_attached` guard above.
	 * Note that `_stp_ctl_fops` can only be initialized once; initializing
	 * it more than once could cause a bad race because _stp_ctl_close_cmd()
	 * is called *before* the final `file->f_op` usage, meaning that the
	 * `_stp_ctl_attached` guard won't stop us from mangling `_stp_ctl_fops`
	 * while it's in use for closing an old control channel fd.
	 */
	if (_stp_ctl_fops.owner != THIS_MODULE) {
		_stp_ctl_fops = *file->f_op;
		_stp_ctl_fops.owner = THIS_MODULE;
	}
	__module_get(THIS_MODULE);
	file->f_op = &_stp_ctl_fops;

	_stp_attach();
	return 0;
}

/* release() handler for the .cmd control file.  The open path allows
   only one opener, so the attach count must reach zero here.  */
static int _stp_ctl_close_cmd(struct inode *inode, struct file *file)
{
	int remaining = atomic_dec_return(&_stp_ctl_attached);

	if (remaining > 0) {
		/* Single-opener invariant violated: refcounting bug.  */
		BUG();
		return -EINVAL;
	}

	_stp_detach();
	return 0;
}

static unsigned _stp_ctl_poll_cmd(struct file *file, poll_table *wait)
{
	unsigned long flags;
	int have_msgs;
	/* Pretend we can always write and that there is priority data
	   available.  We do this so select will report an exception
	   condition on the file, which is used by stapio to see whether
	   select works. */
	unsigned mask = POLLPRI | POLLOUT | POLLWRNORM;

	poll_wait(file, &_stp_ctl_wq, wait);

	/* Readable exactly when the ready queue holds a message.  */
	stp_spin_lock_irqsave(&_stp_ctl_ready_lock, flags);
	have_msgs = !list_empty(&_stp_ctl_ready_q);
	stp_spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);

	if (have_msgs)
		mask |= POLLIN | POLLRDNORM;

	return mask;
}


/* Handlers for the control channel .cmd file, exposed as classic
   file_operations and, on kernels that provide it, as proc_ops.  */
static struct file_operations _stp_ctl_fops_cmd = {
	.owner = THIS_MODULE,
	.read = _stp_ctl_read_cmd,
	.write = _stp_ctl_write_cmd,
	.open = _stp_ctl_open_cmd,
	.release = _stp_ctl_close_cmd,
	.poll = _stp_ctl_poll_cmd
};
#ifdef STAPCONF_PROC_OPS
/* Same handlers for kernels whose procfs takes struct proc_ops.  */
static struct proc_ops _stp_ctl_proc_ops_cmd = {
	.proc_read = _stp_ctl_read_cmd,
	.proc_write = _stp_ctl_write_cmd,
	.proc_open = _stp_ctl_open_cmd,
	.proc_release = _stp_ctl_close_cmd,
	.proc_poll = _stp_ctl_poll_cmd
};
#endif

/* Bring up the control channel: init the ready queue, allocate the
   buffer pool and the preallocated "special" messages, then register
   the filesystem entries.  Returns 0 on success, -1 on failure.  */
static int _stp_register_ctl_channel(void)
{
	INIT_LIST_HEAD(&_stp_ctl_ready_q);

	/* allocate buffers */
	_stp_pool_q = _stp_mempool_init(sizeof(struct _stp_buffer),
					STP_DEFAULT_BUFFERS);
	if (unlikely(_stp_pool_q == NULL))
		goto err0;
	_stp_allocated_net_memory += sizeof(struct _stp_buffer) * STP_DEFAULT_BUFFERS;

	if (unlikely(_stp_ctl_alloc_special_buffers() != 0))
		goto err0;

	if (_stp_register_ctl_channel_fs() != 0) // procfs or debugfs decision time
		goto err0;

	return 0;

err0:
	/* NOTE(review): this path can run with _stp_pool_q == NULL (pool
	   init failure) — presumably _stp_mempool_destroy tolerates NULL;
	   verify against ../mempool.c.  Special buffers came from the
	   pool, so destroying it releases them too.  */
	_stp_mempool_destroy(_stp_pool_q);
	errk("Error creating systemtap control channel.\n");
	return -1;
}

/* Tear down the control channel: remove the fs entries first so no new
   readers/writers appear, then drain and free all buffers.  */
static void _stp_unregister_ctl_channel(void)
{
	struct _stp_buffer *bptr, *tmp;

	_stp_unregister_ctl_channel_fs();

	/* Return memory to pool and free it.  Any messages still queued
	   for delivery are simply dropped at this point.  */
	list_for_each_entry_safe(bptr, tmp, &_stp_ctl_ready_q, list) {
		list_del(&bptr->list);
		_stp_ctl_free_buffer(bptr);
	}
	_stp_ctl_free_special_buffers();
	_stp_mempool_destroy(_stp_pool_q);
}