File: cholmod_super_symbolic.c

package info (click to toggle)
suitesparse 1%3A5.12.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 176,720 kB
  • sloc: ansic: 1,193,914; cpp: 31,704; makefile: 6,638; fortran: 1,927; java: 1,826; csh: 765; ruby: 725; sh: 529; python: 333; perl: 225; sed: 164; awk: 35
file content (992 lines) | stat: -rw-r--r-- 34,010 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
/* ========================================================================== */
/* === Supernodal/cholmod_super_symbolic ==================================== */
/* ========================================================================== */

/* -----------------------------------------------------------------------------
 * CHOLMOD/Supernodal Module. Copyright (C) 2005-2006, Timothy A. Davis
 * http://www.suitesparse.com
 * -------------------------------------------------------------------------- */

/* Supernodal symbolic analysis of the LL' factorization of A, A*A',
 * A(:,f)*A(:,f)'.
 *
 * This routine must be preceded by a simplicial symbolic analysis
 * (cholmod_rowcolcounts).  See cholmod_analyze.c for an example of how to use
 * this routine.
 *
 * The user need not call this directly; cholmod_analyze is a "simple" wrapper
 * for this routine.
 *
 * Symmetric case:
 *
 *	A is stored in column form, with entries stored in the upper triangular
 *	part.  Entries in the lower triangular part are ignored.
 *
 * Unsymmetric case:
 *
 *	A is stored in column form.  If F is equal to the transpose of A, then
 *	A*A' is analyzed.  F can include a subset of the columns of A
 *	(F=A(:,f)'), in which case F*F' is analyzed.
 *
 * Requires Parent and L->ColCount to be defined on input; these are the
 * simplicial Parent and ColCount arrays as computed by cholmod_rowcolcounts.
 * Does not use L->Perm; the input matrices A and F must already be properly
 * permuted.  Allocates and computes the supernodal pattern of L (L->super,
 * L->pi, L->px, and L->s).  Does not allocate the real part (L->x).
 *
 * Supports any xtype (pattern, real, complex, or zomplex).
 */

#ifndef NGPL
#ifndef NSUPERNODAL

#include "cholmod_internal.h"
#include "cholmod_supernodal.h"

#ifdef GPU_BLAS
#include "cholmod_gpu.h"
#endif

/* ========================================================================== */
/* === subtree ============================================================== */
/* ========================================================================== */

/* In the symmetric case, traverse the kth row subtree from the nonzeros in
 * A (0:k1-1,k) and add the new entries found to the pattern of the kth row
 * of L.  The current supernode s contains the diagonal block k1:k2-1, so it
 * can be skipped.
 *
 * In the unsymmetric case, the nonzero pattern of A*F is computed one column
 * at a time (thus, the total time spent in this function is bounded below by
 * the time taken to multiply A*F, which can be high if A is tall and thin).
 * The kth column is A*F(:,k), or the set union of all columns A(:,j) for which
 * F(j,k) is nonzero.  This routine is called once for each entry j.  Only the
 * upper triangular part is needed, so only A (0:k1-1,j) is accessed, where
 * k1:k2-1 are the columns of the current supernode s (k is in the range k1 to
 * k2-1).
 *
 * If A is sorted, then the total time taken by this function is proportional
 * to the number of nonzeros in the strictly block upper triangular part of A,
 * plus the number of entries in the strictly block lower triangular part of
 * the supernodal part of L.  This excludes entries in the diagonal blocks
 * corresponding to the columns in each supernode.  That is, if k1:k2-1 are
 * in a single supernode, then only A (0:k1-1,k1:k2-1) are accessed.
 *
 * For the unsymmetric case, only the strictly block upper triangular part
 * of A*F is constructed.
 *
 * Only adds column indices corresponding to the leading columns of each
 * relaxed supernode.
 */

static void subtree
(
    /* inputs, not modified: */
    Int j,		/* j = k for symmetric case */
    Int k,
    Int Ap [ ],
    Int Ai [ ],
    Int Anz [ ],
    Int SuperMap [ ],
    Int Sparent [ ],
    Int mark,
    Int sorted,         /* true if the columns of A are sorted */
    Int k1,             /* only consider A (0:k1-1,k) */

    /* input/output: */
    Int Flag [ ],
    Int Ls [ ],
    Int Lpi2 [ ]
)
{
    Int p, pend, i, si ;
    p = Ap [j] ;
    pend = (Anz == NULL) ? (Ap [j+1]) : (p + Anz [j]) ;

    for ( ; p < pend ; p++)
    {
	i = Ai [p] ;
	if (i < k1)
	{
	    /* (i,k) is an entry in the upper triangular part of A or A*F'.
	     * symmetric case:   A(i,k) is nonzero (j=k).
	     * unsymmetric case: A(i,j) and F(j,k) are both nonzero.
	     *
	     * Column i is in supernode si = SuperMap [i].  Follow path from si
	     * to root of supernodal etree, stopping at the first flagged
	     * supernode.  The root of the row subtree is supernode SuperMap[k],
	     * which is flagged already. This traversal will stop there, or it
	     * might stop earlier if supernodes have been flagged by previous
	     * calls to this routine for the same k. */
	    for (si = SuperMap [i] ; Flag [si] < mark ; si = Sparent [si])
	    {
		ASSERT (si <= SuperMap [k]) ;
		Ls [Lpi2 [si]++] = k ;
		Flag [si] = mark ;
	    }
	}
        else if (sorted)
        {
            break ;
        }
    }
}


/* clear workspace used by cholmod_super_symbolic */
#define FREE_WORKSPACE \
{ \
    /* CHOLMOD(clear_flag) (Common) ; */ \
    CHOLMOD_CLEAR_FLAG (Common) ; \
    for (k = 0 ; k <= nfsuper ; k++) \
    { \
	Head [k] = EMPTY ; \
    } \
    ASSERT (CHOLMOD(dump_work) (TRUE, TRUE, 0, Common)) ; \
} \


/* ========================================================================== */
/* === cholmod_super_symbolic2 ============================================== */
/* ========================================================================== */

/* Analyze for supernodal Cholesky or multifrontal QR. */

int CHOLMOD(super_symbolic2)
(
    /* ---- input ---- */
    int for_whom,       /* FOR_SPQR     (0): for SPQR but not GPU-accelerated
                           FOR_CHOLESKY (1): for Cholesky (GPU or not)
                           FOR_SPQRGPU  (2): for SPQR with GPU acceleration */
    cholmod_sparse *A,	/* matrix to analyze */
    cholmod_sparse *F,	/* F = A' or A(:,f)' */
    Int *Parent,	/* elimination tree */
    /* ---- in/out --- */
    cholmod_factor *L,	/* simplicial symbolic on input,
			 * supernodal symbolic on output */
    /* --------------- */
    cholmod_common *Common
)
{
    double zrelax0, zrelax1, zrelax2, xxsize ;
    Int *Wi, *Wj, *Super, *Snz, *Ap, *Ai, *Flag, *Head, *Ls, *Lpi, *Lpx, *Fnz,
	*Sparent, *Anz, *SuperMap, *Merged, *Nscol, *Zeros, *Fp, *Fj,
	*ColCount, *Lpi2, *Lsuper, *Iwork ;
    Int nsuper, d, n, j, k, s, mark, parent, p, pend, k1, k2, packed, nscol,
	nsrow, ndrow1, ndrow2, stype, ssize, xsize, sparent, plast, slast,
	csize, maxcsize, ss, nscol0, nscol1, ns, nfsuper, newzeros, totzeros,
	merge, snext, esize, maxesize, nrelax0, nrelax1, nrelax2, Asorted ;
    size_t w ;
    int ok = TRUE, find_xsize ;
    const char* env_use_gpu;
    const char* env_max_bytes;
    size_t max_bytes;
    const char* env_max_fraction;
    double max_fraction;

    /* ---------------------------------------------------------------------- */
    /* check inputs */
    /* ---------------------------------------------------------------------- */

    RETURN_IF_NULL_COMMON (FALSE) ;
    RETURN_IF_NULL (A, FALSE) ;
    RETURN_IF_NULL (L, FALSE) ;
    RETURN_IF_NULL (Parent, FALSE) ;
    RETURN_IF_XTYPE_INVALID (A, CHOLMOD_PATTERN, CHOLMOD_ZOMPLEX, FALSE) ;
    RETURN_IF_XTYPE_INVALID (L, CHOLMOD_PATTERN, CHOLMOD_PATTERN, FALSE) ;
    stype = A->stype ;
    if (stype < 0)
    {
	/* invalid symmetry; symmetric lower form not supported */
	ERROR (CHOLMOD_INVALID, "symmetric lower not supported") ;
	return (FALSE) ;
    }
    if (stype == 0)
    {
	/* F must be present in the unsymmetric case */
	RETURN_IF_NULL (F, FALSE) ;
    }
    if (L->is_super)
    {
	/* L must be a simplicial symbolic factor */
	ERROR (CHOLMOD_INVALID, "L must be symbolic on input") ;
	return (FALSE) ;
    }
    Common->status = CHOLMOD_OK ;

    /* ---------------------------------------------------------------------- */
    /* allocate workspace */
    /* ---------------------------------------------------------------------- */

    n = A->nrow ;

    /* w = 5*n */
    w = CHOLMOD(mult_size_t) (n, 5, &ok) ;
    if (!ok)
    {
	ERROR (CHOLMOD_TOO_LARGE, "problem too large") ;
	return (FALSE) ;
    }

    CHOLMOD(allocate_work) (n, w, 0, Common) ;
    if (Common->status < CHOLMOD_OK)
    {
	/* out of memory */
	return (FALSE) ;
    }
    ASSERT (CHOLMOD(dump_work) (TRUE, TRUE, 0, Common)) ;

    /* ---------------------------------------------------------------------- */
    /* allocate GPU workspace */
    /* ---------------------------------------------------------------------- */

    L->useGPU = 0 ;     /* only used for Cholesky factorization, not QR */

#ifdef GPU_BLAS

    /* GPU module is installed */
    if ( for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY )
    {
        /* only allocate GPU workspace for supernodal Cholesky, and only when
           the GPU is requested and available. */

        max_bytes = 0;
        max_fraction = 0;

#ifdef DLONG
        if ( Common->useGPU == EMPTY )
        {
            /* useGPU not explicity requested by the user, but not explicitly
             * prohibited either.  Query OS environment variables for request.*/
            env_use_gpu  = getenv("CHOLMOD_USE_GPU");

            if ( env_use_gpu )
            {
                /* CHOLMOD_USE_GPU environment variable is set to something */
                if ( atoi ( env_use_gpu ) == 0 )
                {
                    Common->useGPU = 0; /* don't use the gpu */
                }
                else
                {
                    Common->useGPU = 1; /* use the gpu */
                    env_max_bytes = getenv("CHOLMOD_GPU_MEM_BYTES");
                    env_max_fraction = getenv("CHOLMOD_GPU_MEM_FRACTION");
                    if ( env_max_bytes )
                    {
                        max_bytes = atol(env_max_bytes);
                        Common->maxGpuMemBytes = max_bytes;
                    }
                    if ( env_max_fraction )
                    {
                        max_fraction = atof (env_max_fraction);
                        if ( max_fraction < 0 ) max_fraction = 0;
                        if ( max_fraction > 1 ) max_fraction = 1;
                        Common->maxGpuMemFraction = max_fraction;
                    }	  
                }
            }
            else
            {
                /* CHOLMOD_USE_GPU environment variable not set, so no GPU
                 * acceleration will be used */
                Common->useGPU = 0;
            }
            /* fprintf (stderr, "useGPU queried: %d\n", Common->useGPU) ; */
        }

        /* Ensure that a GPU is present */
        if ( Common->useGPU == 1 )
        {
            /* fprintf (stderr, "\nprobe GPU:\n") ; */
            Common->useGPU = CHOLMOD(gpu_probe) (Common);   // Cholesky only, not SPQR
            /* fprintf (stderr, "\nprobe GPU: result %d\n", Common->useGPU) ; */
        }

        if ( Common->useGPU == 1 )
        {
            /* Cholesky + GPU, so allocate space */
            /* fprintf (stderr, "allocate GPU:\n") ; */
            CHOLMOD(gpu_allocate) ( Common );               // Cholesky only, not SPQR
            /* fprintf (stderr, "allocate GPU done\n") ; */
        }
#else
        /* GPU acceleration is only supported for long int version */
        Common->useGPU = 0;
#endif

        /* Cache the fact that the symbolic factorization supports 
         * GPU acceleration */
        L->useGPU = Common->useGPU;

    }

#else
    /* GPU module is not installed */
    Common->useGPU = 0 ;
#endif

    /* ---------------------------------------------------------------------- */
    /* get inputs */
    /* ---------------------------------------------------------------------- */

    /* A is now either A or triu(A(p,p)) for the symmetric case.  It is either
     * A or A(p,f) for the unsymmetric case (both in column form).  It can be
     * either packed or unpacked, and either sorted or unsorted.  Entries in
     * the lower triangular part may be present if A is symmetric, but these
     * are ignored. */

    Ap = A->p ;
    Ai = A->i ;
    Anz = A->nz ;

    if (stype != 0)
    {
	/* F not accessed */
	Fp = NULL ;
	Fj = NULL ;
	Fnz = NULL ;
	packed = TRUE ;
    }
    else
    {
	/* F = A(:,f) or A(p,f) in packed row form, either sorted or unsorted */
	Fp = F->p ;
	Fj = F->i ;
	Fnz = F->nz ;
	packed = F->packed ;
    }

    ColCount = L->ColCount ;

    nrelax0 = Common->nrelax [0] ;
    nrelax1 = Common->nrelax [1] ;
    nrelax2 = Common->nrelax [2] ;

    zrelax0 = Common->zrelax [0] ;
    zrelax1 = Common->zrelax [1] ;
    zrelax2 = Common->zrelax [2] ;

    zrelax0 = IS_NAN (zrelax0) ? 0 : zrelax0 ;
    zrelax1 = IS_NAN (zrelax1) ? 0 : zrelax1 ;
    zrelax2 = IS_NAN (zrelax2) ? 0 : zrelax2 ;

    ASSERT (CHOLMOD(dump_parent) (Parent, n, "Parent", Common)) ;

    /* ---------------------------------------------------------------------- */
    /* get workspace */
    /* ---------------------------------------------------------------------- */

    /* Sparent, Snz, and Merged could be allocated later, of size nfsuper */

    Iwork = Common->Iwork ;
    Wi      = Iwork ;	    /* size n (i/l/l).  Lpi2 is i/l/l */
    Wj      = Iwork + n ;   /* size n (i/l/l).  Zeros is i/l/l */
    Sparent = Iwork + 2*((size_t) n) ; /* size nfsuper <= n [ */
    Snz     = Iwork + 3*((size_t) n) ; /* size nfsuper <= n [ */
    Merged  = Iwork + 4*((size_t) n) ; /* size nfsuper <= n [ */

    Flag = Common->Flag ;   /* size n */
    Head = Common->Head ;   /* size n+1 */

    /* ---------------------------------------------------------------------- */
    /* find the fundamental supernodes */
    /* ---------------------------------------------------------------------- */

    /* count the number of children of each node, using Wi [ */
    for (j = 0 ; j < n ; j++)
    {
	Wi [j] = 0 ;
    }
    for (j = 0 ; j < n ; j++)
    {
	parent = Parent [j] ;
	if (parent != EMPTY)
	{
	    Wi [parent]++ ;
	}
    }

    Super = Head ;  /* use Head [0..nfsuper] as workspace for Super list ( */

    /* column 0 always starts a new supernode */
    nfsuper = (n == 0) ? 0 : 1 ;	/* number of fundamental supernodes */
    Super [0] = 0 ;

    for (j = 1 ; j < n ; j++)
    {
	/* check if j starts new supernode, or in the same supernode as j-1 */
	if (Parent [j-1] != j	    /* parent of j-1 is not j */
	    || (ColCount [j-1] != ColCount [j] + 1) /* j-1 not subset of j*/
	    || Wi [j] > 1	    /* j has more than one child */
#ifdef GPU_BLAS
	    /* Ensure that the supernode will fit in the GPU buffers */
	    /* Data size of 16 bytes must be assumed for case of PATTERN */
	    || (for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY && L->useGPU && 
		 (j-Super[nfsuper-1]+1) * 
		 ColCount[Super[nfsuper-1]] * sizeof(double) * 2 >= 
		 Common->devBuffSize)
#endif
	    )
	{
	    /* j is the leading node of a supernode */
	    Super [nfsuper++] = j ;
	}
    }
    Super [nfsuper] = n ;

    /* contents of Wi no longer needed for child count ] */

    Nscol = Wi ; /* use Wi as size-nfsuper workspace for Nscol [ */

    /* ---------------------------------------------------------------------- */
    /* find the mapping of fundamental nodes to supernodes */
    /* ---------------------------------------------------------------------- */

    SuperMap = Wj ;	/* use Wj as workspace for SuperMap [ */

    /* SuperMap [k] = s if column k is contained in supernode s */
    for (s = 0 ; s < nfsuper ; s++)
    {
	for (k = Super [s] ; k < Super [s+1] ; k++)
	{
	    SuperMap [k] = s ;
	}
    }

    /* ---------------------------------------------------------------------- */
    /* construct the fundamental supernodal etree */
    /* ---------------------------------------------------------------------- */

    for (s = 0 ; s < nfsuper ; s++)
    {
	j = Super [s+1] - 1 ;	/* last node in supernode s */
	parent = Parent [j] ;	/* parent of last node */
	Sparent [s] = (parent == EMPTY) ? EMPTY : SuperMap [parent] ;
	PRINT1 (("Sparent ["ID"] = "ID"\n", s, Sparent [s])) ;
    }

    /* contents of Wj no longer needed as workspace for SuperMap ]
     * SuperMap will be recomputed below, for the relaxed supernodes. */

    Zeros = Wj ;   /* use Wj for Zeros, workspace of size nfsuper [ */

    /* ---------------------------------------------------------------------- */
    /* relaxed amalgamation */
    /* ---------------------------------------------------------------------- */

    for (s = 0 ; s < nfsuper ; s++)
    {
	Merged [s] = EMPTY ;			/* s not merged into another */
	Nscol [s] = Super [s+1] - Super [s] ;	/* # of columns in s */
	Zeros [s] = 0 ;				/* # of zero entries in s */
	ASSERT (s <= Super [s]) ;
	Snz [s] = ColCount [Super [s]] ;  /* # of entries in leading col of s */
	PRINT2 (("lnz ["ID"] "ID"\n", s, Snz [s])) ;
    }

    for (s = nfsuper-2 ; s >= 0 ; s--)
    {
        double lnz1 ;

	/* should supernodes s and s+1 merge into a new node s? */
	PRINT1 (("\n========= Check relax of s "ID" and s+1 "ID"\n", s, s+1)) ;

	ss = Sparent [s] ;
	if (ss == EMPTY)
	{
	    PRINT1 (("s "ID" is a root, no merge with s+1 = "ID"\n", s, s+1)) ;
	    continue ;
	}

	/* find the current parent of s (perform path compression as needed) */
	for (ss = Sparent [s] ; Merged [ss] != EMPTY ; ss = Merged [ss]) ;
	sparent = ss ;
	PRINT2 (("Current sparent of s "ID" is "ID"\n", s, sparent)) ;

	/* ss is the current parent of s */
	for (ss = Sparent [s] ; Merged [ss] != EMPTY ; ss = snext)
	{
	    snext = Merged [ss] ;
	    PRINT2 (("ss "ID" is dead, merged into snext "ID"\n", ss, snext)) ;
	    Merged [ss] = sparent ;
	}

	/* if s+1 is not the current parent of s, do not merge */
	if (sparent != s+1)
	{
	    continue ;
	}

	nscol0 = Nscol [s] ;	/* # of columns in s */
	nscol1 = Nscol [s+1] ;	/* # of columns in s+1 */
	ns = nscol0 + nscol1 ;
	PRINT2 (("ns "ID" nscol0 "ID" nscol1 "ID"\n", ns, nscol0, nscol1)) ;

	totzeros = Zeros [s+1] ;	/* current # of zeros in s+1 */
	lnz1 = (double) (Snz [s+1]) ;	/* # entries in leading column of s+1 */

	/* determine if supernodes s and s+1 should merge */
	if (ns <= nrelax0)
	{
	    PRINT2 (("ns is tiny ("ID"), so go ahead and merge\n", ns)) ;
	    merge = TRUE ;
	}
	else
	{
	    /* use double to avoid integer overflow */
	    double lnz0 = Snz [s] ;	/* # entries in leading column of s */
	    double xnewzeros = nscol0 * (lnz1 + nscol0 - lnz0) ;

	    /* use Int for the final update of Zeros [s] below */
	    newzeros = nscol0 * (Snz [s+1] + nscol0 - Snz [s]) ;
	    ASSERT (newzeros == xnewzeros) ;

	    PRINT2 (("lnz0 %g lnz1 %g xnewzeros %g\n", lnz0, lnz1, xnewzeros)) ;
	    if (xnewzeros == 0)
	    {
		/* no new zeros, so go ahead and merge */
		PRINT2 (("no new fillin, so go ahead and merge\n")) ;
		merge = TRUE ;
	    }
	    else
	    {
		/* # of zeros if merged */
		double xtotzeros = ((double) totzeros) + xnewzeros ;

		/* xtotsize: total size of merged supernode, if merged: */
		double xns = (double) ns ;
		double xtotsize  = (xns * (xns+1) / 2) + xns * (lnz1 - nscol1) ;
		double z = xtotzeros / xtotsize ;

		Int totsize ;
		totsize  = (ns * (ns+1) / 2) + ns * (Snz [s+1] - nscol1) ;

		PRINT2 (("oldzeros "ID" newzeros "ID" xtotsize %g z %g\n",
			    Zeros [s+1], newzeros, xtotsize, z)) ;

		/* use Int for the final update of Zeros [s] below */
		totzeros += newzeros ;

		/* do not merge if supernode would become too big
		 * (Int overflow).  Continue computing; not (yet) an error. */
		/* fl.pt. compare, but no NaN's can occur here */
		merge = ((ns <= nrelax1 && z < zrelax0) ||
			 (ns <= nrelax2 && z < zrelax1) ||
					  (z < zrelax2)) &&
			(xtotsize < Int_max / sizeof (double)) ;

	    }
	}

#ifdef GPU_BLAS
	if ( for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY && L->useGPU ) {
	  /* Ensure that the aggregated supernode fits in the device 
	     supernode buffers */
	  double xns = (double) ns;
	  if ( ((xns * xns) + xns * (lnz1 - nscol1))*sizeof(double)*2  >= 
	       Common->devBuffSize ) {
	    merge = FALSE;
	  }
	}
#endif

	if (merge)
	{
	    PRINT1 (("Merge node s ("ID") and s+1 ("ID")\n", s, s+1)) ;
	    Zeros [s] = totzeros ;
	    Merged [s+1] = s ;
	    Snz [s] = nscol0 + Snz [s+1] ;
	    Nscol [s] += Nscol [s+1] ;
	}
    }

    /* contents of Wj no longer needed for Zeros ] */
    /* contents of Wi no longer needed for Nscol ] */
    /* contents of Sparent no longer needed (recomputed below) */

    /* ---------------------------------------------------------------------- */
    /* construct the relaxed supernode list */
    /* ---------------------------------------------------------------------- */

    nsuper = 0 ;
    for (s = 0 ; s < nfsuper ; s++)
    {
	if (Merged [s] == EMPTY)
	{
	    PRINT1 (("live supernode: "ID" snz "ID"\n", s, Snz [s])) ;
	    Super [nsuper] = Super [s] ;
	    Snz [nsuper] = Snz [s] ;
	    nsuper++ ;
	}
    }
    Super [nsuper] = n ;
    PRINT1 (("Fundamental supernodes: "ID"  relaxed "ID"\n", nfsuper, nsuper)) ;

    /* Merged no longer needed ] */

    /* ---------------------------------------------------------------------- */
    /* find the mapping of relaxed nodes to supernodes */
    /* ---------------------------------------------------------------------- */

    /* use Wj as workspace for SuperMap { */

    /* SuperMap [k] = s if column k is contained in supernode s */
    for (s = 0 ; s < nsuper ; s++)
    {
	for (k = Super [s] ; k < Super [s+1] ; k++)
	{
	    SuperMap [k] = s ;
	}
    }

    /* ---------------------------------------------------------------------- */
    /* construct the relaxed supernodal etree */
    /* ---------------------------------------------------------------------- */

    for (s = 0 ; s < nsuper ; s++)
    {
	j = Super [s+1] - 1 ;	/* last node in supernode s */
	parent = Parent [j] ;	/* parent of last node */
	Sparent [s] = (parent == EMPTY) ? EMPTY : SuperMap [parent] ;
	PRINT1 (("new Sparent ["ID"] = "ID"\n", s, Sparent [s])) ;
    }

    /* ---------------------------------------------------------------------- */
    /* determine the size of L->s and L->x */
    /* ---------------------------------------------------------------------- */

    ssize = 0 ;
    xsize = 0 ;
    xxsize = 0 ;
    find_xsize = for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY ||
                 for_whom == CHOLMOD_ANALYZE_FOR_SPQRGPU ;
    for (s = 0 ; s < nsuper ; s++)
    {
	nscol = Super [s+1] - Super [s] ;
	nsrow = Snz [s] ;
	ASSERT (nscol > 0) ;
	ssize += nsrow ;
        if (find_xsize)
        {
            xsize += nscol * nsrow ;
            /* also compute xsize in double to guard against Int overflow */
            xxsize += ((double) nscol) * ((double) nsrow) ;
        }
	if (ssize < 0 ||(find_xsize && xxsize > Int_max))
	{
	    /* Int overflow, clear workspace and return.
               QR factorization will not use xxsize, so that error is ignored.
               For Cholesky factorization, however, memory of space xxsize
               will be allocated, so this is a failure.  Both QR and Cholesky
               fail if ssize overflows. */
	    ERROR (CHOLMOD_TOO_LARGE, "problem too large") ;
	    FREE_WORKSPACE ;
	    return (FALSE) ;
	}
	ASSERT (ssize > 0) ;
        ASSERT (IMPLIES (find_xsize, xsize > 0)) ;
    }
    xsize = MAX (1, xsize) ;
    ssize = MAX (1, ssize) ;
    PRINT1 (("ix sizes: "ID" "ID" nsuper "ID"\n", ssize, xsize, nsuper)) ;

    /* ---------------------------------------------------------------------- */
    /* allocate L (all except real part L->x) */
    /* ---------------------------------------------------------------------- */

    L->ssize = ssize ;
    L->xsize = xsize ;
    L->nsuper = nsuper ;

    CHOLMOD(change_factor) (CHOLMOD_PATTERN, TRUE, TRUE, TRUE, TRUE, L, Common);

    if (Common->status < CHOLMOD_OK)
    {
	/* out of memory; L is still a valid simplicial symbolic factor */
	FREE_WORKSPACE ;
	return (FALSE) ;
    }

    DEBUG (CHOLMOD(dump_factor) (L, "L to symbolic super", Common)) ;
    ASSERT (L->is_ll && L->xtype == CHOLMOD_PATTERN && L->is_super) ;

    Lpi = L->pi ;
    Lpx = L->px ;
    Ls = L->s ;
    Ls [0] = 0 ;    /* flag for cholmod_check_factor; supernodes are defined */
    Lsuper = L->super ;

    /* copy the list of relaxed supernodes into the final list in L */
    for (s = 0 ; s <= nsuper ; s++)
    {
	Lsuper [s] = Super [s] ;
    }

    /* Head no longer needed as workspace for fundamental Super list ) */

    Super = Lsuper ;	    /* Super is now the list of relaxed supernodes */

    /* ---------------------------------------------------------------------- */
    /* construct column pointers of relaxed supernodal pattern (L->pi) */
    /* ---------------------------------------------------------------------- */

    p = 0 ;
    for (s = 0 ; s < nsuper ; s++)
    {
	Lpi [s] = p ;
	p += Snz [s] ;
	PRINT1 (("Snz ["ID"] = "ID", Super ["ID"] = "ID"\n",
		    s, Snz [s], s, Super[s])) ;
    }
    Lpi [nsuper] = p ;
    ASSERT ((Int) (L->ssize) == MAX (1,p)) ;

    /* ---------------------------------------------------------------------- */
    /* construct pointers for supernodal values (L->px) */
    /* ---------------------------------------------------------------------- */

    if (for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY ||
        for_whom == CHOLMOD_ANALYZE_FOR_SPQRGPU)
    {
        Lpx [0] = 0 ;
        p = 0 ;
        for (s = 0 ; s < nsuper ; s++)
        {
            nscol = Super [s+1] - Super [s] ;   /* number of columns in s */
            nsrow = Snz [s] ;           /* # of rows, incl triangular part*/
            Lpx [s] = p ;               /* pointer to numerical part of s */
            p += nscol * nsrow ;
        }
        Lpx [s] = p ;
        ASSERT ((Int) (L->xsize) == MAX (1,p)) ;
    }
    else
    {
        /* L->px is not needed for non-GPU accelerated QR factorization (it may
         * lead to Int overflow, anyway, if xsize caused Int overflow above).
         * Use a magic number to tell cholmod_check_factor to ignore Lpx. */
        Lpx [0] = 123456 ;
    }

    /* Snz no longer needed ] */

    /* ---------------------------------------------------------------------- */
    /* symbolic analysis to construct the relaxed supernodal pattern (L->s) */
    /* ---------------------------------------------------------------------- */

    Lpi2 = Wi ;	    /* copy Lpi into Lpi2, using Wi as workspace for Lpi2 [ */
    for (s = 0 ; s < nsuper ; s++)
    {
	Lpi2 [s] = Lpi [s] ;
    }

    Asorted = A->sorted ;

    for (s = 0 ; s < nsuper ; s++)
    {
	/* sth supernode is in columns k1 to k2-1.
	 * compute nonzero pattern of L (k1:k2-1,:). */

	/* place rows k1 to k2-1 in leading column of supernode s */
	k1 = Super [s] ;
	k2 = Super [s+1] ;
	PRINT1 (("=========>>> Supernode "ID" k1 "ID" k2-1 "ID"\n",
		    s, k1, k2-1)) ;
	for (k = k1 ; k < k2 ; k++)
	{
	    Ls [Lpi2 [s]++] = k ;
	}

	/* compute nonzero pattern each row k1 to k2-1 */
	for (k = k1 ; k < k2 ; k++)
	{
	    /* compute row k of L.  In the symmetric case, the pattern of L(k,:)
	     * is the set of nodes reachable in the supernodal etree from any
	     * row i in the nonzero pattern of A(0:k,k).  In the unsymmetric
	     * case, the pattern of the kth column of A*A' is the set union
	     * of all columns A(0:k,j) for each nonzero F(j,k). */

	    /* clear the Flag array and mark the current supernode */
	    /* mark = CHOLMOD(clear_flag) (Common) ; */
	    CHOLMOD_CLEAR_FLAG (Common) ;
	    mark = Common->mark ;
	    Flag [s] = mark ;
	    ASSERT (s == SuperMap [k]) ;

	    /* traverse the row subtree for each nonzero in A or AA' */
	    if (stype != 0)
	    {
		subtree (k, k, Ap, Ai, Anz, SuperMap, Sparent, mark,
                        Asorted, k1, Flag, Ls, Lpi2) ;
	    }
	    else
	    {
		/* for each j nonzero in F (:,k) do */
		p = Fp [k] ;
		pend = (packed) ? (Fp [k+1]) : (p + Fnz [k]) ;
		for ( ; p < pend ; p++)
		{
		    subtree (Fj [p], k, Ap, Ai, Anz, SuperMap, Sparent, mark,
			    Asorted, k1, Flag, Ls, Lpi2) ;
		}
	    }
	}
    }
#ifndef NDEBUG
    for (s = 0 ; s < nsuper ; s++)
    {
	PRINT1 (("Lpi2[s] "ID" Lpi[s+1] "ID"\n", Lpi2 [s], Lpi [s+1])) ;
	ASSERT (Lpi2 [s] == Lpi [s+1]) ;
	CHOLMOD(dump_super) (s, Super, Lpi, Ls, NULL, NULL, 0, Common) ;
    }
#endif

    /* contents of Wi no longer needed for Lpi2 ] */
    /* Sparent no longer needed ] */

    /* ---------------------------------------------------------------------- */
    /* determine the largest update matrix (L->maxcsize) */
    /* ---------------------------------------------------------------------- */

    /* maxcsize could be determined before L->s is allocated and defined, which
     * would mean that all memory requirements for both the symbolic and numeric
     * factorizations could be computed using O(nnz(A)+O(n)) space.  However, it
     * would require a lot of extra work.  The analysis phase, above, would need
     * to be duplicated, but with Ls not kept; instead, the algorithm would keep
     * track of the current s and slast for each supernode d, and update them
     * when a new row index appears in supernode d.  An alternative would be to
     * do this computation only if the allocation of L->s failed, in which case
     * the following code would be skipped.
     *
     * The csize for a supernode is the size of its largest contribution to
     * a subsequent ancestor supernode.  For example, suppose the rows of #'s
     * in the figure below correspond to the columns of a subsequent supernode,
     * and the dots are the entries in that ancestore.
     *
     *	    c
     *	    c c
     *	    c c c
     *	    x x x
     *	    x x x
     *	    # # #   .
     *	    # # #   . .
     *	    * * *   . .
     *	    * * *   . .
     *	    * * *   . .
     *	            . .
     *
     * Then for this update, the csize is 3-by-2, or 6, because there are 3
     * rows of *'s which is the number of rows in the update, and there are
     * 2 rows of #'s, which is the number columns in the update.  The csize
     * of a supernode is the largest such contribution for any ancestor
     * supernode.  maxcsize, for the whole matrix, has a rough upper bound of
     * the maximum size of any supernode.  This bound is loose, because the
     * the contribution must be less than the size of the ancestor supernodal
     * that it's updating.  maxcsize of a completely dense matrix, with one
     * supernode, is zero.
     *
     * maxesize is the column dimension for the workspace E needed for the
     * solve.  E is of size nrhs-by-maxesize, where the nrhs is the number of
     * columns in the right-hand-side.  The maxesize is the largest esize of
     * any supernode.  The esize of a supernode is the number of row indices
     * it contains, excluding the column indices of the supernode itself.
     * For the following example, esize is 4:
     *
     *	    c
     *	    c c
     *	    c c c
     *	    x x x
     *	    x x x
     *	    x x x
     *	    x x x
     *
     * maxesize can be no bigger than n.
     */

    maxcsize = 1 ;
    maxesize = 1 ;

    /* Do not need to guard csize against Int overflow since xsize is OK. */

    if (for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY ||
        for_whom == CHOLMOD_ANALYZE_FOR_SPQRGPU)
    {
        /* this is not needed for non-GPU accelerated QR factorization */
        for (d = 0 ; d < nsuper ; d++)
        {
            nscol = Super [d+1] - Super [d] ;
            p = Lpi [d] + nscol ;
            plast = p ;
            pend = Lpi [d+1] ;
            esize = pend - p ;
            maxesize = MAX (maxesize, esize) ;
            slast = (p == pend) ? (EMPTY) : (SuperMap [Ls [p]]) ;
            for ( ; p <= pend ; p++)
            {
                s = (p == pend) ? (EMPTY) : (SuperMap [Ls [p]]) ;
                if (s != slast)
                {
                    /* row i is the start of a new supernode */
                    ndrow1 = p - plast ;
                    ndrow2 = pend - plast ;
                    csize = ndrow2 * ndrow1 ;
                    PRINT1 (("Supernode "ID" ancestor "ID" C: "ID"-by-"ID
                        "  csize "ID"\n", d, slast, ndrow1, ndrow2, csize)) ;
                    maxcsize = MAX (maxcsize, csize) ;
                    plast = p ;
                    slast = s ;
                }
            }
        }
        PRINT1 (("max csize "ID"\n", maxcsize)) ;
    }

    /* Wj no longer needed for SuperMap } */

    L->maxcsize = maxcsize ;
    L->maxesize = maxesize ;
    L->is_super = TRUE ;
    ASSERT (L->xtype == CHOLMOD_PATTERN && L->is_ll) ;

    /* ---------------------------------------------------------------------- */
    /* supernodal symbolic factorization is complete */
    /* ---------------------------------------------------------------------- */

    FREE_WORKSPACE ;
    return (TRUE) ;
}

/* ========================================================================== */
/* === cholmod_super_symbolic =============================================== */
/* ========================================================================== */

/* Analyzes A, AA', or A(:,f)*A(:,f)' in preparation for a supernodal numeric
 * factorization.  The user need not call this directly; cholmod_analyze is
 * a "simple" wrapper for this routine.
 * 
 * This function does all the analysis for a supernodal Cholesky factorization.
 *
 * workspace: Flag (nrow), Head (nrow), Iwork (2*nrow),
 * and temporary space of size 3*nfsuper*sizeof(Int), where nfsuper <= n
 * is the number of fundamental supernodes.
 */

int CHOLMOD(super_symbolic)
(
    /* ---- input ---- */
    cholmod_sparse *A,	/* matrix to analyze */
    cholmod_sparse *F,	/* F = A' or A(:,f)' */
    Int *Parent,	/* elimination tree */
    /* ---- in/out --- */
    cholmod_factor *L,	/* simplicial symbolic on input,
			 * supernodal symbolic on output */
    /* --------------- */
    cholmod_common *Common
)
{
    return (CHOLMOD(super_symbolic2) (CHOLMOD_ANALYZE_FOR_CHOLESKY,
        A, F, Parent, L, Common)) ;
}
#endif
#endif