File: GB_subref_template.c

//------------------------------------------------------------------------------
// GB_subref_template: C = A(I,J)
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// GB_subref_template extracts a submatrix, C = A(I,J).  The method is done
// in two phases.  Phase 1 just counts the entries in C, and phase 2
// constructs the pattern and values of C.  There are 3 kinds of subref:
//
//      symbolic: C(i,j) is the position of A(I(i),J(j)) in the matrix A
//      iso:      C = A(I,J), extracting the pattern only, not the values
//      numeric:  C = A(I,J), extracting both the pattern and the values
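//
// An illustrative example (not from the code): with I = [2,0] and J = [1],
// numeric subref builds C where C(0,0) = A(2,1) and C(1,0) = A(0,1), keeping
// only those entries actually present in A; iso subref builds the same
// pattern without copying values; symbolic subref stores, in place of each
// value, the position p of the corresponding entry in Ai,Ax.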

#if defined ( GB_SYMBOLIC )

    // symbolic method must tolerate zombies
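    // (a zombie is an entry marked for deletion; its index in Ai is stored
    // in flipped form, and GBI_UNFLIP decodes it back to the original index)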
    #define GB_Ai(p) GBI_UNFLIP (Ai, p, avlen)

#else

    // iso and non-iso numeric methods will not see any zombies
    #define GB_Ai(p) GBI (Ai, p, avlen)

#endif

// to iterate across all entries in a bucket:
#define GB_for_each_index_in_bucket(inew,i)     \
    for (int64_t inew = Mark [i] - 1 ; inew >= 0 ; inew = Inext [inew])
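
// Mark and Inext encode the inverse of the index list I, constructed outside
// this template (the assumed encoding): Mark [i]-1 is the head of a linked
// list of all positions inew with I [inew] == i (Mark [i] == 0 if i does not
// appear in I), and Inext chains the remaining duplicates.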

//------------------------------------------------------------------------------

{

    //--------------------------------------------------------------------------
    // get A and I
    //--------------------------------------------------------------------------

    const int64_t *restrict Ai = A->i ;
    const int64_t avlen = A->vlen ;

    // these values are ignored if Ikind == GB_LIST
    int64_t ibegin = Icolon [GxB_BEGIN] ;
    int64_t iinc   = Icolon [GxB_INC  ] ;
    int64_t inc    = (iinc < 0) ? (-iinc) : iinc ;
    #ifdef GB_DEBUG
    int64_t iend   = Icolon [GxB_END  ] ;
    #endif

    //--------------------------------------------------------------------------
    // phase1: count entries in each C(:,kC); phase2: compute C
    //--------------------------------------------------------------------------

    int taskid ;
    #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
    for (taskid = 0 ; taskid < ntasks ; taskid++)
    {

        //----------------------------------------------------------------------
        // get the task descriptor
        //----------------------------------------------------------------------

        int64_t kfirst = TaskList [taskid].kfirst ;
        int64_t klast  = TaskList [taskid].klast ;
        bool fine_task = (klast < 0) ;
        if (fine_task)
        {
            // a fine task operates on a slice of a single vector
            klast = kfirst ;
        }

        // a coarse task accesses all of I for all its vectors
        int64_t pI     = 0 ;
        int64_t pI_end = nI ;
        int64_t ilen   = nI ;

        ASSERT (0 <= kfirst && kfirst <= klast && klast < Cnvec) ;

        //----------------------------------------------------------------------
        // compute all vectors C(:,kfirst:klast) for this task
        //----------------------------------------------------------------------

        for (int64_t kC = kfirst ; kC <= klast ; kC++)
        {

            //------------------------------------------------------------------
            // get C(:,kC)
            //------------------------------------------------------------------

            #if defined ( GB_ANALYSIS_PHASE )
            // phase1 simply counts the # of entries in C(*,kC).
            int64_t clen = 0 ;
            #else
            // This task computes all or part of C(:,kC), which are the entries
            // in Ci,Cx [pC:pC_end-1].
            int64_t pC, pC_end ;
            if (fine_task)
            { 
                // A fine task computes a slice of C(:,kC)
                pC     = TaskList [taskid  ].pC ;
                pC_end = TaskList [taskid+1].pC ;
                ASSERT (Cp [kC] <= pC && pC <= pC_end && pC_end <= Cp [kC+1]) ;
            }
            else
            { 
                // The vectors of C are never sliced for a coarse task, so this
                // task computes all of C(:,kC).
                pC     = Cp [kC] ;
                pC_end = Cp [kC+1] ;
            }
            int64_t clen = pC_end - pC ;
            if (clen == 0) continue ;
            #endif

            //------------------------------------------------------------------
            // get A(:,kA)
            //------------------------------------------------------------------

            int64_t pA, pA_end ;

            if (fine_task)
            { 
                // a fine task computes a slice of a single vector C(:,kC).
                // The task accesses Ai,Ax [pA:pA_end-1], which holds either
                // the entire vector A(imin:imax,kA) for method 6, the entire
                // dense A(:,kA) for methods 1 and 2, or a slice of the
                // A(imin:imax,kA) vector for all other methods.
                pA     = TaskList [taskid].pA ;
                pA_end = TaskList [taskid].pA_end ;
            }
            else
            { 
                // a coarse task computes the entire vector C(:,kC).  The task
                // accesses all of A(imin:imax,kA), for most methods, or all of
                // A(:,kA) for methods 1 and 2.  The vector A(*,kA) appears in
                // Ai,Ax [pA:pA_end-1].
                pA     = Ap_start [kC] ;
                pA_end = Ap_end   [kC] ;
            }

            int64_t alen = pA_end - pA ;
            if (alen == 0) continue ;

            //------------------------------------------------------------------
            // get I
            //------------------------------------------------------------------

            if (fine_task)
            { 
                // A fine task accesses I [pI:pI_end-1].  For methods 2 and 6,
                // pI:pI_end is a subset of the entire 0:nI-1 list.  For all
                // other methods, pI = 0 and pI_end = nI, and the task can
                // access all of I.
                pI     = TaskList [taskid].pB ;
                pI_end = TaskList [taskid].pB_end ;
                ilen   = pI_end - pI ;
            }

            //------------------------------------------------------------------
            // determine the method to use
            //------------------------------------------------------------------

            int method ;
            if (fine_task)
            { 
                // The method that the fine task uses for its slice of A(*,kA)
                // and C(*,kC) has already been determined by GB_subref_slice.
                method = (int) (-TaskList [taskid].klast) ;
            }
            else
            { 
                // determine the method based on A(*,kA) and I
                method = GB_subref_method (NULL, NULL, alen, avlen, Ikind, nI,
                    (Mark != NULL), need_qsort, iinc, nduplicates) ;
            }

            //------------------------------------------------------------------
            // extract C (:,kC) = A (I,kA): consider all cases
            //------------------------------------------------------------------

            switch (method)
            {

                //--------------------------------------------------------------
                case 1 : // C(:,kC) = A(:,kA) where A(:,kA) is dense
                //--------------------------------------------------------------

                    // A (:,kA) has not been sliced
                    ASSERT (Ikind == GB_ALL) ;
                    ASSERT (pA     == Ap_start [kC]) ;
                    ASSERT (pA_end == Ap_end   [kC]) ;
                    // copy the entire vector and construct indices
                    #if defined ( GB_ANALYSIS_PHASE )
                    clen = ilen ;
                    #else
                    for (int64_t k = 0 ; k < ilen ; k++)
                    { 
                        int64_t inew = k + pI ;
                        ASSERT (inew == GB_ijlist (I, inew, Ikind, Icolon)) ;
                        ASSERT (inew == GB_Ai (pA + inew)) ;
                        Ci [pC + k] = inew ;
                    }
                    GB_COPY_RANGE (pC, pA + pI, ilen) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 2 : // C(:,kC) = A(I,kA) where A(I,kA) is dense
                //--------------------------------------------------------------

                    // This method handles any kind of list I, but A(:,kA)
                    // must be dense.  A(:,kA) has not been sliced.
                    ASSERT (pA     == Ap_start [kC]) ;
                    ASSERT (pA_end == Ap_end   [kC]) ;
                    // scan I and get the entry in A(:,kA) via direct lookup
                    #if defined ( GB_ANALYSIS_PHASE )
                    clen = ilen ;
                    #else
                    for (int64_t k = 0 ; k < ilen ; k++)
                    { 
                        // C(inew,kC) =  A(i,kA), and it always exists.
                        int64_t inew = k + pI ;
                        int64_t i = GB_ijlist (I, inew, Ikind, Icolon) ;
                        ASSERT (i == GB_Ai (pA + i)) ;
                        Ci [pC + k] = inew ;
                        GB_COPY_ENTRY (pC + k, pA + i) ;
                    }
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 3 : // the list I has a single index, ibegin
                //--------------------------------------------------------------

                    // binary search in GB_subref_phase0 has already found it.
                    // This can be any Ikind with nI=1: GB_ALL with A->vlen=1,
                    // GB_RANGE with ibegin==iend, GB_STRIDE such as 0:-1:0
                    // (with length 1), or a GB_LIST with ni=1.

                    // Time: 50x faster

                    ASSERT (!fine_task) ;
                    ASSERT (alen == 1) ;
                    ASSERT (nI == 1) ;
                    ASSERT (GB_Ai (pA) == GB_ijlist (I, 0, Ikind, Icolon)) ;
                    #if defined ( GB_ANALYSIS_PHASE )
                    clen = 1 ;
                    #else
                    Ci [pC] = 0 ;
                    GB_COPY_ENTRY (pC, pA) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 4 : // Ikind is ":", thus C(:,kC) = A (:,kA)
                //--------------------------------------------------------------

                    // Time: 1x faster but low speedup on the Mac.  Why?
                    // Probably memory bound since it is just memcpy's.

                    ASSERT (Ikind == GB_ALL && ibegin == 0) ;
                    #if defined ( GB_ANALYSIS_PHASE )
                    clen = alen ;
                    #else
                    #if defined ( GB_SYMBOLIC )
                    if (nzombies == 0)
                    { 
                        memcpy (Ci + pC, Ai + pA, alen * sizeof (int64_t)) ;
                    }
                    else
                    {
                        // with zombies
                        for (int64_t k = 0 ; k < alen ; k++)
                        { 
                            // symbolic C(:,kC) = A(:,kA) where A has zombies
                            int64_t i = GB_Ai (pA + k) ;
                            ASSERT (i == GB_ijlist (I, i, Ikind, Icolon)) ;
                            Ci [pC + k] = i ;
                        }
                    }
                    #else
                    memcpy (Ci + pC, Ai + pA, alen * sizeof (int64_t)) ;
                    #endif
                    GB_COPY_RANGE (pC, pA, alen) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 5 : // Ikind is GB_RANGE = ibegin:iend
                //--------------------------------------------------------------

                    // Time: much faster.  Good speedup too.

                    ASSERT (Ikind == GB_RANGE) ;
                    #if defined ( GB_ANALYSIS_PHASE )
                    clen = alen ;
                    #else
                    for (int64_t k = 0 ; k < alen ; k++)
                    { 
                        int64_t i = GB_Ai (pA + k) ;
                        int64_t inew = i - ibegin ;
                        ASSERT (i == GB_ijlist (I, inew, Ikind, Icolon)) ;
                        Ci [pC + k] = inew ;
                    }
                    GB_COPY_RANGE (pC, pA, alen) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 6 : // I is short vs nnz (A (:,kA)), use binary search
                //--------------------------------------------------------------

                    // Time: very slow unless I is very short and A(:,kA) is
                    // very long.

                    // This case can handle any kind of I, and A(:,kA) of any
                    // properties.  For a fine task, A(:,kA) has not been
                    // sliced; I has been sliced instead.

                    // If the I bucket inverse has not been created, this
                    // method is the only option.  Alternatively, if nI =
                    // length (I) is << nnz (A (:,kA)), then scanning I and
                    // doing a binary search of A (:,kA) is faster than doing a
                    // linear-time search of A(:,kA) and a lookup into the I
                    // bucket inverse.

                    // The vector of C is constructed in sorted order, so no
                    // sort is needed.
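
                    // Roughly, this method does O (ilen log (alen)) work,
                    // versus O (alen) work for the bucket-based methods, so
                    // it wins when I is much shorter than A (:,kA).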

                    // A(:,kA) has not been sliced.
                    ASSERT (pA     == Ap_start [kC]) ;
                    ASSERT (pA_end == Ap_end   [kC]) ;

                    // scan I, in order, and search for the entry in A(:,kA)
                    for (int64_t k = 0 ; k < ilen ; k++)
                    {
                        // C(inew,kC) = A (i,kA), if it exists.
                        // i = I [inew] ; or from a colon expression
                        int64_t inew = k + pI ;
                        int64_t i = GB_ijlist (I, inew, Ikind, Icolon) ;
                        bool found ;
                        int64_t pleft = pA ;
                        int64_t pright = pA_end - 1 ;
                        #if defined ( GB_SYMBOLIC )
                        bool is_zombie ;
                        GB_BINARY_SEARCH_ZOMBIE (i, Ai, pleft, pright, found,
                            nzombies, is_zombie) ;
                        #else
                        GB_BINARY_SEARCH (i, Ai, pleft, pright, found) ;
                        #endif
                        if (found)
                        { 
                            ASSERT (i == GB_Ai (pleft)) ;
                            #if defined ( GB_ANALYSIS_PHASE )
                            clen++ ;
                            #else
                            ASSERT (pC < pC_end) ;
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pleft) ;
                            pC++ ;
                            #endif
                        }
                    }
                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 7 : // I is ibegin:iinc:iend with iinc > 1
                //--------------------------------------------------------------

                    // Time: 1 thread: C=A(1:2:n,:) is 3x slower
                    // but has good speedup.  About as fast with
                    // enough threads.
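
                    // For example (illustrative): with ibegin = 1 and
                    // iinc = 3, an entry at i = 7 gives i - ibegin = 6 and
                    // 6 % 3 == 0, so it is kept as inew = 6/3 = 2; an entry
                    // at i = 8 gives 7 % 3 != 0 and is skipped.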

                    ASSERT (Ikind == GB_STRIDE && iinc > 1) ;
                    for (int64_t k = 0 ; k < alen ; k++)
                    {
                        // A(i,kA) present; see if it is in ibegin:iinc:iend
                        int64_t i = GB_Ai (pA + k) ;
                        ASSERT (ibegin <= i && i <= iend) ;
                        i = i - ibegin ;
                        if (i % iinc == 0)
                        { 
                            // i is in the sequence ibegin:iinc:iend
                            #if defined ( GB_ANALYSIS_PHASE )
                            clen++ ;
                            #else
                            int64_t inew = i / iinc ;
                            ASSERT (pC < pC_end) ;
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pA + k) ;
                            pC++ ;
                            #endif
                        }
                    }
                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 8 : // I = ibegin:(-iinc):iend, with iinc < -1
                //--------------------------------------------------------------

                    // Time: 2x slower for iinc = -2 or -8.
                    // Good speedup though.  Faster for
                    // large values (iinc = -128).
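
                    // The scan runs backwards (k = alen-1 down to 0) since a
                    // negative stride reverses I, so C(:,kC) is produced in
                    // ascending order.  For example (illustrative): with
                    // ibegin = 9 and iinc = -2 (inc = 2), an entry at i = 5
                    // gives ibegin - i = 4, 4 % 2 == 0, so inew = 4/2 = 2.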
                
                    ASSERT (Ikind == GB_STRIDE && iinc < -1) ;
                    for (int64_t k = alen - 1 ; k >= 0 ; k--)
                    {
                        // A(i,kA) present; see if it is in ibegin:iinc:iend
                        int64_t i = GB_Ai (pA + k) ;
                        ASSERT (iend <= i && i <= ibegin) ;
                        i = ibegin - i ;
                        if (i % inc == 0)
                        { 
                            // i is in the sequence ibegin:iinc:iend
                            #if defined ( GB_ANALYSIS_PHASE )
                            clen++ ;
                            #else
                            int64_t inew = i / inc ;
                            ASSERT (pC < pC_end) ;
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pA + k) ;
                            pC++ ;
                            #endif
                        }
                    }
                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 9 : // I = ibegin:(-1):iend
                //--------------------------------------------------------------

                    // Time: much faster.  Good speedup.

                    ASSERT (Ikind == GB_STRIDE && iinc == -1) ;
                    #if defined ( GB_ANALYSIS_PHASE )
                    clen = alen ;
                    #else
                    for (int64_t k = alen - 1 ; k >= 0 ; k--)
                    { 
                        // A(i,kA) is present
                        int64_t i = GB_Ai (pA + k) ;
                        int64_t inew = (ibegin - i) ;
                        ASSERT (i == GB_ijlist (I, inew, Ikind, Icolon)) ;
                        Ci [pC] = inew ;
                        GB_COPY_ENTRY (pC, pA + k) ;
                        pC++ ;
                    }
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 10 : // I unsorted, and C needs qsort, duplicates OK
                //--------------------------------------------------------------

                    // Time: with one thread: 2x slower, probably
                    // because of the qsort.  Good speedup however.  This is
                    // used if qsort is needed but ndupl == 0.  Try a method
                    // that needs qsort, but no duplicates?

                    // Case 10 works well when I has many entries and A(:,kA)
                    // has few entries. C(:,kC) must be sorted after this pass.

                    ASSERT (Ikind == GB_LIST) ;
                    for (int64_t k = 0 ; k < alen ; k++)
                    {
                        // A(i,kA) present, look it up in the I inverse buckets
                        int64_t i = GB_Ai (pA + k) ;
                        // traverse bucket i for all indices inew where
                        // i == I [inew] or where i is from a colon expression
                        GB_for_each_index_in_bucket (inew, i)
                        { 
                            ASSERT (inew >= 0 && inew < nI) ;
                            ASSERT (i == GB_ijlist (I, inew, Ikind, Icolon)) ;
                            #if defined ( GB_ANALYSIS_PHASE )
                            clen++ ;
                            #else
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pA + k) ;
                            pC++ ;
                            #endif
                        }
                    }

                    // TODO: skip the sort if C is allowed to be jumbled on
                    // output.  Flag C as jumbled instead.

                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    if (!fine_task)
                    { 
                        // a coarse task owns this entire C(:,kC) vector, so
                        // the sort can be done now.  The sort for vectors
                        // handled by multiple fine tasks must wait until all
                        // tasks are completed, below in the post sort.
                        pC = Cp [kC] ;

                        #if defined ( GB_ISO_SUBREF )
                        // iso numeric subref C=A(I,J)
                        // just sort the pattern of C(:,kC)
                        GB_qsort_1 (Ci + pC, clen) ;
                        #else
                        // sort the pattern of C(:,kC), and the values
                        GB_qsort_1b (Ci + pC, (GB_void *) (Cx + pC*GB_CSIZE1),
                            GB_CSIZE2, clen) ;
                        #endif
                    }
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 11 : // I not contiguous, with duplicates. No qsort needed
                //--------------------------------------------------------------

                    // Case 11 works well when I has many entries and A(:,kA)
                    // has few entries.  It requires that I be sorted on input,
                    // so that no sort is required for C(:,kC).  It is
                    // otherwise identical to Case 10.

                    ASSERT (Ikind == GB_LIST) ;
                    for (int64_t k = 0 ; k < alen ; k++)
                    {
                        // A(i,kA) present, look it up in the I inverse buckets
                        int64_t i = GB_Ai (pA + k) ;
                        // traverse bucket i for all indices inew where
                        // i == I [inew] or where i is from a colon expression
                        GB_for_each_index_in_bucket (inew, i)
                        { 
                            ASSERT (inew >= 0 && inew < nI) ;
                            ASSERT (i == GB_ijlist (I, inew, Ikind, Icolon)) ;
                            #if defined ( GB_ANALYSIS_PHASE )
                            clen++ ;
                            #else
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pA + k) ;
                            pC++ ;
                            #endif
                        }
                    }

                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 12 : // I not contiguous, no duplicates.  No qsort needed.
                //--------------------------------------------------------------

                    // Identical to Case 11, except GB_for_each_index_in_bucket
                    // just needs to iterate 0 or 1 times.  Works well when I
                    // has many entries and A(:,kA) has few entries.

                    ASSERT (Ikind == GB_LIST && nduplicates == 0) ;
                    for (int64_t k = 0 ; k < alen ; k++)
                    {
                        // A(i,kA) present, look it up in the I inverse buckets
                        int64_t i = GB_Ai (pA + k) ;
                        // bucket i has at most one index inew such that
                        // i == I [inew]
                        int64_t inew = Mark [i] - 1 ;
                        if (inew >= 0)
                        { 
                            ASSERT (inew >= 0 && inew < nI) ;
                            ASSERT (i == GB_ijlist (I, inew, Ikind, Icolon)) ;
                            #if defined ( GB_ANALYSIS_PHASE )
                            clen++ ;
                            #else
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pA + k) ;
                            pC++ ;
                            #endif
                        }
                    }

                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                default: ;
                //--------------------------------------------------------------
            }

            //------------------------------------------------------------------
            // final count of nnz (C (:,j))
            //------------------------------------------------------------------

            #if defined ( GB_ANALYSIS_PHASE )
            if (fine_task)
            { 
                TaskList [taskid].pC = clen ;
            }
            else
            { 
                Cp [kC] = clen ;
            }
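            // (these are counts only; the caller is assumed to convert Cp,
            // and the fine-task counts in TaskList [taskid].pC, into the
            // cumulative offsets used by phase 2 via a cumulative sum)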
            #endif
        }
    }

    //--------------------------------------------------------------------------
    // phase2: post sort for any vectors handled by fine tasks with method 10
    //--------------------------------------------------------------------------

    #if defined ( GB_PHASE_2_OF_2 )
    {
        if (post_sort)
        {
            int taskid ;
            #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
            for (taskid = 0 ; taskid < ntasks ; taskid++)
            {
                int64_t kC = TaskList [taskid].kfirst ;
                bool do_post_sort = (TaskList [taskid].len != 0) ;
                if (do_post_sort)
                {
                    // This is the first fine task with method 10 for C(:,kC).
                    // The vector C(:,kC) must be sorted, since method 10 left
                    // it with unsorted indices.
                    int64_t pC = Cp [kC] ;
                    int64_t clen = Cp [kC+1] - pC ;
                    #if defined ( GB_ISO_SUBREF )
                    { 
                        // iso numeric subref C=A(I,J)
                        // just sort the pattern of C(:,kC)
                        GB_qsort_1 (Ci + pC, clen) ;
                    }
                    #else
                    { 
                        // sort the pattern of C(:,kC), and the values
                        GB_qsort_1b (Ci + pC, (GB_void *) (Cx + pC*GB_CSIZE1),
                            GB_CSIZE2, clen) ;
                    }
                    #endif
                }
            }
        }
    }
    #endif

}

#undef GB_Ai
#undef GB_for_each_index_in_bucket
#undef GB_COPY_RANGE
#undef GB_COPY_ENTRY
#undef GB_CSIZE1
#undef GB_CSIZE2
#undef GB_SYMBOLIC
#undef GB_ISO_SUBREF