#ifndef __PLINK2_MATRIX_H__
#define __PLINK2_MATRIX_H__
// This file is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell,
// Christopher Chang.
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
// Wrappers for frequent LAPACK calls (sometimes with no-LAPACK fallbacks).
// Now supports MKL backend.
// todo: allow this to take advantage of 64-bit integer LAPACK. As of this
// writing, it's available on Amazon EC2 64-bit Linux instances, but I can't
// find it for Windows. (And even if OS X vecLib adds it soon, we can't use it
// there anytime soon because static linking is not an option.)
// #include "plink2_cmdline.h"
#include "include/plink2_bits.h"
#ifdef NOLAPACK
typedef double MatrixInvertBuf1;
CONSTI32(kMatrixInvertBuf1ElemAlloc, 2 * sizeof(double));
CONSTI32(kMatrixInvertBuf1CheckedAlloc, 2 * sizeof(double));
# define __CLPK_integer int
#else // not NOLAPACK
# ifdef __APPLE__
// Make -DLAPACK_ILP64 and -DACCELERATE_LAPACK_ILP64 have the same effect.
# if defined(LAPACK_ILP64) && !defined(ACCELERATE_LAPACK_ILP64)
# define ACCELERATE_LAPACK_ILP64
# endif
# if defined(ACCELERATE_LAPACK_ILP64) && !defined(LAPACK_ILP64)
# define LAPACK_ILP64
# endif
# endif
# ifdef DYNAMIC_MKL
# define USE_MKL
# endif
# ifdef USE_MKL
# ifdef __APPLE__
# error "plink2 cannot currently use MKL on OS X."
# endif
# ifdef LAPACK_ILP64
# define MKL_ILP64
# endif
# ifdef DYNAMIC_MKL
# include <mkl_service.h>
# else
// If this isn't initially found, use the compiler's -I option to specify the
// appropriate include-file directory. A common location is
// /opt/intel/mkl/include .
// If this isn't installed at all on your system but you want/need to change
// that, see the instructions at
// https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-apt-repo
# include "mkl_service.h"
# endif
# define USE_MTBLAS
// this technically doesn't have to be a macro, but it's surrounded by other
// things which do have to be macros, so changing this to a namespaced function
// arguably *decreases* overall readability...
# define BLAS_SET_NUM_THREADS mkl_set_num_threads
# else
# ifdef USE_OPENBLAS
# ifdef __cplusplus
extern "C" {
# endif
void openblas_set_num_threads(int num_threads);
# ifdef __cplusplus
} // extern "C"
# endif
# define USE_MTBLAS
# define BLAS_SET_NUM_THREADS openblas_set_num_threads
# else
# ifndef ACCELERATE_NEW_LAPACK
# define BLAS_SET_NUM_THREADS(num)
# endif
# endif
# endif
# ifdef __APPLE__
# ifdef ACCELERATE_NEW_LAPACK
# ifdef LAPACK_ILP64
// Unfortunately, the compiler complains about incompatible pointer types if
// we use the less-ambiguous "long long" or "int64_t" here.
typedef long __CLPK_integer;
static_assert(sizeof(long) == 8, "Unexpected 'long' type size.");
# else
typedef int32_t __CLPK_integer;
# endif
# else // !ACCELERATE_NEW_LAPACK
# ifdef LAPACK_ILP64
# error "LAPACK_ILP64 requires ACCELERATE_NEW_LAPACK on macOS"
# endif
# endif
# include <Accelerate/Accelerate.h>
# define USE_CBLAS_XGEMM
# if defined(ACCELERATE_NEW_LAPACK) && !defined(USE_OPENBLAS)
HEADER_INLINE void BLAS_SET_NUM_THREADS(__attribute__((unused)) int num_threads) {
if (__builtin_available(macOS 15.0, *)) {
if (num_threads > 1) {
BLASSetThreading(BLAS_THREADING_MULTI_THREADED);
} else {
BLASSetThreading(BLAS_THREADING_SINGLE_THREADED);
}
}
}
# endif
# elif defined(USE_AOCL)
# define USE_CBLAS_XGEMM
# endif
# ifndef __APPLE__
# ifdef __cplusplus
extern "C" {
# endif
typedef double __CLPK_doublereal;
# ifdef __LP64__
# ifdef LAPACK_ILP64
typedef long long __CLPK_integer;
# else
typedef int32_t __CLPK_integer;
# endif
# else
# ifdef LAPACK_ILP64
# error "Invalid compile flags."
# else
# ifdef _WIN32
// probably don't need this?
typedef int32_t __CLPK_integer;
# else
typedef long int __CLPK_integer;
# endif
# endif
# endif // !__LP64__
# ifdef _WIN32
// openblas is easy enough to set up on Windows nowadays.
// not worth the trouble of ripping out vector extensions, etc. just so we
// can compile with Visual Studio and gain access to MKL
# ifndef USE_OPENBLAS
# error "Windows build currently requires OpenBLAS's LAPACK."
# endif
# define HAVE_LAPACK_CONFIG_H
# define LAPACK_COMPLEX_STRUCTURE
# include "lapacke.h"
__CLPK_doublereal ddot_(__CLPK_integer* n, __CLPK_doublereal* dx,
__CLPK_integer* incx, __CLPK_doublereal* dy,
__CLPK_integer* incy);
__CLPK_doublereal sdot_(__CLPK_integer* n, float* sx, __CLPK_integer* incx,
float* sy, __CLPK_integer* incy);
# else // Linux
# ifdef USE_MKL
# define USE_CBLAS_XGEMM
# ifdef DYNAMIC_MKL
# include <mkl_cblas.h>
# include <mkl_lapack.h>
# else
# include "mkl_cblas.h"
# include "mkl_lapack.h"
# endif
static_assert(sizeof(MKL_INT) == 8, "Unexpected MKL_INT size.");
# else
// If you want 64-bit index support, but not MKL (e.g. you're targeting an
// AMD processor), modify the Makefile to link to a LAPACK library recompiled
// with -fdefault-integer-8.
# ifdef USE_CBLAS_XGEMM
# include <cblas.h>
int dpotri_(char* uplo, __CLPK_integer* n, __CLPK_doublereal* a,
__CLPK_integer* lda, __CLPK_integer* info);
# else
// ARGH
// cmake on Ubuntu 14 seems to require use of cblas_f77.h instead of
// cblas.h. Conversely, cblas_f77.h does not seem to be available on
// the Scientific Linux ATLAS/LAPACK install, and right now that's my
// only option for producing 32-bit static builds...
// So. Default include is cblas.h. To play well with cmake + Ubuntu
// 14 and 16 simultaneously, there is a CBLAS_F77_ON_OLD_GCC mode
// which picks cblas_f77.h on Ubuntu 14 and cblas.h on 16.
# ifdef FORCE_CBLAS_F77
# include <cblas_f77.h>
# elif !defined(CBLAS_F77_ON_OLD_GCC)
# include <cblas.h>
# else
# if (__GNUC__ == 4)
# include <cblas_f77.h>
# else
# if __has_include(<cblas.h>)
# include <cblas.h>
# else
# include <cblas_f77.h>
# endif
# endif
# endif
__CLPK_doublereal ddot_(__CLPK_integer* n, __CLPK_doublereal* dx,
__CLPK_integer* incx, __CLPK_doublereal* dy,
__CLPK_integer* incy);
__CLPK_doublereal sdot_(__CLPK_integer* n, float* sx, __CLPK_integer* incx,
float* sy, __CLPK_integer* incy);
int dpotri_(char* uplo, __CLPK_integer* n, __CLPK_doublereal* a,
__CLPK_integer* lda, __CLPK_integer* info);
# endif
# endif // !USE_MKL
# ifdef USE_CUDA
# include "cuda/plink2_matrix_cuda.h"
# endif
# endif
void xerbla_(void);
# ifdef __cplusplus
} // extern "C"
# endif
# endif // !__APPLE__
typedef __CLPK_integer MatrixInvertBuf1;
// need to be careful about >= 2^32?
CONSTI32(kMatrixInvertBuf1ElemAlloc, sizeof(__CLPK_integer));
// InvertMatrixChecked() usually requires a larger buffer.
CONSTI32(kMatrixInvertBuf1CheckedAlloc, 2 * sizeof(__CLPK_integer));
#endif // !NOLAPACK
#ifdef __cplusplus
namespace plink2 {
#endif
static const double kMatrixSingularRcond = 1e-14;
// Returns -1 if no inf/nan found.
// May move this to a more central location if there are other users.
intptr_t FirstInfOrNan(const double* vec, uintptr_t size);
uint32_t LowerTriangularFirstInfOrNan(const double* matrix, uintptr_t dim, uintptr_t* row_idx_ptr, uintptr_t* col_idx_ptr);
// Copies the (C-order) lower-left triangle to the upper-right triangle.
void ReflectMatrix(uint32_t dim, double* matrix);
void ReflectStridedMatrix(uint32_t dim, uint32_t stride, double* matrix);
void ReflectFmatrix(uint32_t dim, uint32_t stride, float* matrix);
// If dim < stride, this zeroes out the trailing elements of each row.
void ReflectStridedMatrix0(uint32_t dim, uint32_t stride, double* matrix);
void ReflectFmatrix0(uint32_t dim, uint32_t stride, float* matrix);
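// For example (illustrative), with dim == 3 and '.' marking entries whose
// initial contents are ignored, ReflectMatrix() performs
//   [ a . . ]      [ a b d ]
//   [ b c . ]  ->  [ b c e ]
//   [ d e f ]      [ d e f ]
// leaving the diagonal and lower-left triangle unchanged.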
// AddDVec and AddFVec execute main += arg, under the assumption that both
// vectors have been zero-padded.
// "ctav" is the float-length of both vectors, rounded up to a vector boundary.
// (With the current implementation, it's harmless to just pass in ct instead,
// but ctav is better practice since it indicates awareness of what this
// function is actually doing.)
HEADER_INLINE void AddDVec(const double* arg, uintptr_t ctav, double* main) {
// LEA instruction supports multiply by 4 or 8, but *not* by 16 or 32, so
// this awkward loop tends to generate slightly better code than the more
// natural vector-based loop.
for (uintptr_t ulii = 0; ulii < ctav; ulii += kDoublePerDVec) {
*R_CAST(VecD*, &(main[ulii])) += *R_CAST(const VecD*, &(arg[ulii]));
}
}
HEADER_INLINE void AddFVec(const float* arg, uintptr_t ctav, float* main) {
for (uintptr_t ulii = 0; ulii < ctav; ulii += kFloatPerFVec) {
*R_CAST(VecF*, &(main[ulii])) += *R_CAST(const VecF*, &(arg[ulii]));
}
}
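// Usage sketch (illustrative; increment_buf and accumulator_buf are
// hypothetical vector-aligned, zero-padded buffers):
//   const uintptr_t ctav = RoundUpPow2(ct, kDoublePerDVec);
//   // ...allocate both buffers with ctav doubles, the last (ctav - ct)
//   // entries of each zeroed...
//   AddDVec(increment_buf, ctav, accumulator_buf);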
#ifdef __LP64__
// FillDVec sets all elements of dst to dxx, and zero-fills any trailing
// elements (w.r.t. vector boundaries).
// bugfix (21 Nov 2023): forgot about trailing zero-fill.
HEADER_INLINE void FillDVec(uintptr_t ct, double dxx, double* dst) {
for (uintptr_t ulii = 0; ulii != ct; ++ulii) {
dst[ulii] = dxx;
}
const uintptr_t remainder = ct % kDoublePerDVec;
if (remainder) {
const uintptr_t ctav = ct + kDoublePerDVec - remainder;
for (uintptr_t ulii = ct; ulii != ctav; ++ulii) {
dst[ulii] = 0.0;
}
}
}
#else
HEADER_INLINE void FillDVec(uintptr_t ct, double dxx, double* dst) {
for (uintptr_t ulii = 0; ulii != ct; ++ulii) {
dst[ulii] = dxx;
}
}
#endif
#ifdef __LP64__
// FillFVec sets all elements of dst to fxx, and zero-fills any trailing
// elements (w.r.t. vector boundaries).
HEADER_INLINE void FillFVec(uintptr_t ct, float fxx, float* dst) {
const uintptr_t fullvec_ct = ct / kFloatPerFVec;
const VecF vfill = VCONST_F(fxx);
VecF* dst_alias = R_CAST(VecF*, dst);
for (uintptr_t ulii = 0; ulii != fullvec_ct; ++ulii) {
dst_alias[ulii] = vfill;
}
const uintptr_t trailing_start_idx = fullvec_ct * kFloatPerFVec;
if (trailing_start_idx != ct) {
for (uintptr_t ulii = trailing_start_idx; ulii != ct; ++ulii) {
dst[ulii] = fxx;
}
const uintptr_t trailing_stop_idx = trailing_start_idx + kFloatPerFVec;
for (uintptr_t ulii = ct; ulii != trailing_stop_idx; ++ulii) {
dst[ulii] = S_CAST(float, 0.0);
}
}
}
#else
HEADER_INLINE void FillFVec(uintptr_t ct, float fxx, float* dst) {
for (uintptr_t ulii = 0; ulii != ct; ++ulii) {
dst[ulii] = fxx;
}
}
#endif
HEADER_INLINE double DotprodDShort(const double* vec1, const double* vec2, uint32_t ct) {
double dotprod = 0.0;
for (uint32_t uii = 0; uii != ct; ++uii) {
dotprod += vec1[uii] * vec2[uii];
}
return dotprod;
}
HEADER_INLINE float DotprodFShort(const float* vec1, const float* vec2, uint32_t ct) {
float dotprod = 0.0;
for (uint32_t uii = 0; uii != ct; ++uii) {
dotprod += vec1[uii] * vec2[uii];
}
return dotprod;
}
// todo: benchmark again after Spectre/Meltdown mitigation is deployed
CONSTI32(kDotprodDThresh, 17);
CONSTI32(kDotprodFThresh, 15);
#ifdef NOLAPACK
HEADER_INLINE double DotprodD(const double* vec1, const double* vec2, uint32_t ct) {
return DotprodDShort(vec1, vec2, ct);
}
HEADER_INLINE double DotprodxD(const double* vec1, const double* vec2, uint32_t ct) {
return DotprodDShort(vec1, vec2, ct);
}
HEADER_INLINE float DotprodF(const float* vec1, const float* vec2, uint32_t ct) {
return DotprodFShort(vec1, vec2, ct);
}
BoolErr InvertMatrix(int32_t dim, double* matrix, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf);
HEADER_INLINE BoolErr InvertMatrixChecked(int32_t dim, double* matrix, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf) {
return InvertMatrix(dim, matrix, dbl_1d_buf, dbl_2d_buf);
}
HEADER_INLINE BoolErr InvertSymmdefMatrix(int32_t dim, double* matrix, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf) {
ReflectMatrix(dim, matrix);
return InvertMatrix(dim, matrix, dbl_1d_buf, dbl_2d_buf);
}
HEADER_INLINE BoolErr InvertSymmdefMatrixChecked(int32_t dim, double* matrix, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf) {
ReflectMatrix(dim, matrix);
return InvertMatrix(dim, matrix, dbl_1d_buf, dbl_2d_buf);
}
// The "first half" computes either an LU or a singular value decomposition,
// along with the determinant; the "second half" actually inverts the matrix,
// assuming dbl_1d_buf and dbl_2d_buf still contain the first half's results.
BoolErr InvertStridedMatrixFirstHalf(int32_t dim, int32_t stride, double* matrix, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf);
HEADER_INLINE BoolErr InvertSymmdefMatrixFirstHalf(int32_t dim, int32_t stride, double* matrix, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf) {
ReflectStridedMatrix(dim, stride, matrix);
return InvertStridedMatrixFirstHalf(dim, stride, matrix, dbl_1d_buf, dbl_2d_buf);
}
BoolErr InvertFmatrixFirstHalf(int32_t dim, uint32_t stride, const float* matrix, double* half_inverted, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf);
HEADER_INLINE BoolErr InvertSymmdefFmatrixFirstHalf(int32_t dim, uint32_t stride, float* matrix, double* half_inverted, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf) {
ReflectFmatrix(dim, stride, matrix);
return InvertFmatrixFirstHalf(dim, stride, matrix, half_inverted, dbl_1d_buf, dbl_2d_buf);
}
HEADER_INLINE double HalfInvertedDet(__maybe_unused const double* half_inverted_iter, __maybe_unused const MatrixInvertBuf1* dbl_1d_buf, uint32_t dim) {
// singular values in dbl_1d_buf
double det_u = dbl_1d_buf[0];
for (uint32_t uii = 1; uii != dim; ++uii) {
det_u *= dbl_1d_buf[uii];
}
return fabs(det_u);
}
HEADER_INLINE double HalfSymmInvertedDet(__maybe_unused const double* half_inverted_iter, __maybe_unused const MatrixInvertBuf1* dbl_1d_buf, uint32_t dim, __maybe_unused uint32_t stride) {
return HalfInvertedDet(half_inverted_iter, dbl_1d_buf, dim);
}
void InvertStridedMatrixSecondHalf(__CLPK_integer dim, __CLPK_integer stride, double* inverted_result, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf);
HEADER_INLINE void InvertSymmdefMatrixSecondHalf(__CLPK_integer dim, __CLPK_integer stride, double* inverted_result, __maybe_unused MatrixInvertBuf1* dbl_1d_buf, __maybe_unused double* dbl_2d_buf) {
InvertStridedMatrixSecondHalf(dim, stride, inverted_result, dbl_1d_buf, dbl_2d_buf);
}
void InvertFmatrixSecondHalf(__CLPK_integer dim, uint32_t stride, double* half_inverted, float* inverted_result, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf);
HEADER_INLINE void InvertSymmdefFmatrixSecondHalf(__CLPK_integer dim, uint32_t stride, double* half_inverted, float* inverted_result, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf) {
InvertFmatrixSecondHalf(dim, stride, half_inverted, inverted_result, dbl_1d_buf, dbl_2d_buf);
}
HEADER_INLINE BoolErr InvertSymmdefStridedMatrix(__CLPK_integer dim, __CLPK_integer stride, double* matrix, MatrixInvertBuf1* dbl_1d_buf, double* dbl_2d_buf) {
if (InvertSymmdefMatrixFirstHalf(dim, stride, matrix, dbl_1d_buf, dbl_2d_buf)) {
return 1;
}
InvertSymmdefMatrixSecondHalf(dim, stride, matrix, dbl_1d_buf, dbl_2d_buf);
return 0;
}
#else // !NOLAPACK
HEADER_INLINE double DotprodD(const double* vec1, const double* vec2, uint32_t ct) {
# ifndef USE_CBLAS_XGEMM
__CLPK_integer cti = ct;
__CLPK_integer incxy = 1;
return ddot_(&cti, K_CAST(double*, vec1), &incxy, K_CAST(double*, vec2), &incxy);
# else
return cblas_ddot(ct, vec1, 1, vec2, 1);
# endif
}
HEADER_INLINE double DotprodxD(const double* vec1, const double* vec2, uint32_t ct) {
if (ct > kDotprodDThresh) {
// best threshold is machine-dependent; this is what I got on my MacBook
return DotprodD(vec1, vec2, ct);
}
return DotprodDShort(vec1, vec2, ct);
}
// not worthwhile for ct < 16.
HEADER_INLINE float DotprodF(const float* vec1, const float* vec2, uint32_t ct) {
# ifndef USE_CBLAS_XGEMM
__CLPK_integer cti = ct;
__CLPK_integer incxy = 1;
return S_CAST(float, sdot_(&cti, K_CAST(float*, vec1), &incxy, K_CAST(float*, vec2), &incxy));
# else
return cblas_sdot(ct, vec1, 1, vec2, 1);
# endif
}
// The extra if-statement in DotprodxF() seemed disproportionately expensive
// in testing... so there's no auto-branching float version for now.
BoolErr InvertMatrix(__CLPK_integer dim, double* matrix, MatrixInvertBuf1* int_1d_buf, double* dbl_2d_buf);
BoolErr InvertMatrixChecked(__CLPK_integer dim, double* matrix, MatrixInvertBuf1* int_1d_buf, double* dbl_2d_buf);
// The InvertSymmdef... functions only assume the (C-order) lower-left
// triangle of the matrix is filled, and only the lower-left triangle of the
// returned matrix is valid.
BoolErr InvertSymmdefMatrix(__CLPK_integer dim, double* matrix, MatrixInvertBuf1* int_1d_buf, double* dbl_2d_buf);
BoolErr InvertSymmdefStridedMatrix(__CLPK_integer dim, __CLPK_integer stride, double* matrix, MatrixInvertBuf1* int_1d_buf, double* dbl_2d_buf);
// dbl_2d_buf must have room for at least max(dim, 3) * dim elements.
BoolErr InvertSymmdefMatrixChecked(__CLPK_integer dim, double* matrix, MatrixInvertBuf1* int_1d_buf, double* dbl_2d_buf);
BoolErr InvertSymmdefMatrixFirstHalf(__CLPK_integer dim, __CLPK_integer stride, double* matrix, MatrixInvertBuf1* int_1d_buf, double* dbl_2d_buf);
BoolErr InvertFmatrixFirstHalf(__CLPK_integer dim, uint32_t stride, const float* matrix, double* half_inverted, MatrixInvertBuf1* int_1d_buf, double* dbl_2d_buf);
BoolErr InvertSymmdefFmatrixFirstHalf(__CLPK_integer dim, uint32_t stride, float* matrix, double* half_inverted, MatrixInvertBuf1* int_1d_buf, double* dbl_2d_buf);
/*
HEADER_INLINE double HalfInvertedDet(__maybe_unused const double* half_inverted_iter, __maybe_unused const MatrixInvertBuf1* int_1d_buf, uint32_t dim) {
uint32_t dim_p1 = dim + 1;
double det_u = *half_inverted_iter;
for (uint32_t uii = 1; uii != dim; ++uii) {
half_inverted_iter = &(half_inverted_iter[dim_p1]);
det_u *= (*half_inverted_iter);
}
return fabs(det_u);
}
*/
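// Since InvertSymmdefMatrixSecondHalf() finishes the job with dpotri_(), the
// symmdef first half must leave a Cholesky factor (dpotrf output) behind; the
// diagonal scanned below is therefore the Cholesky factor's diagonal, and its
// squared product equals the determinant of the original matrix.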
HEADER_INLINE double HalfSymmInvertedDet(__maybe_unused const double* half_inverted_iter, __maybe_unused const MatrixInvertBuf1* int_1d_buf, uint32_t dim, uint32_t stride) {
uint32_t stride_p1 = stride + 1;
double sqrt_det_u = *half_inverted_iter;
for (uint32_t uii = 1; uii != dim; ++uii) {
half_inverted_iter = &(half_inverted_iter[stride_p1]);
sqrt_det_u *= (*half_inverted_iter);
}
return sqrt_det_u * sqrt_det_u;
}
HEADER_INLINE void InvertSymmdefMatrixSecondHalf(__CLPK_integer dim, __CLPK_integer stride, double* matrix, __maybe_unused MatrixInvertBuf1* int_1d_buf, __maybe_unused double* dbl_2d_buf) {
char uplo = 'U';
__CLPK_integer info;
dpotri_(&uplo, &dim, matrix, &stride, &info);
}
void InvertFmatrixSecondHalf(__CLPK_integer dim, uint32_t stride, double* half_inverted, float* inverted_result, MatrixInvertBuf1* int_1d_buf, double* dbl_2d_buf);
void InvertSymmdefFmatrixSecondHalf(__CLPK_integer dim, uint32_t stride, double* half_inverted, float* inverted_result, MatrixInvertBuf1* int_1d_buf, double* dbl_2d_buf);
#endif // !NOLAPACK
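// Typical two-phase usage (illustrative; mi_buf/dbl_2d_buf/half_inverted are
// placeholder buffer names, and matrix is a hypothetical C-order
// lower-triangle-filled float matrix), following the same first-half /
// second-half pattern used by InvertSymmdefStridedMatrix():
//   if (InvertSymmdefFmatrixFirstHalf(dim, stride, matrix, half_inverted,
//                                     mi_buf, dbl_2d_buf)) {
//     // decomposition failed; treat the matrix as singular
//   }
//   // (determinant, if needed, via HalfSymmInvertedDet() at this point)
//   InvertSymmdefFmatrixSecondHalf(dim, stride, half_inverted, matrix,
//                                  mi_buf, dbl_2d_buf);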
void ColMajorMatrixMultiply(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer col2_ct, __CLPK_integer common_ct, double* outmatrix);
HEADER_INLINE void RowMajorMatrixMultiply(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer col2_ct, __CLPK_integer common_ct, double* outmatrix) {
return ColMajorMatrixMultiply(inmatrix2, inmatrix1, col2_ct, row1_ct, common_ct, outmatrix);
}
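// (This works because a row-major matrix reinterpreted as column-major is its
// transpose: computing inmatrix2^T * inmatrix1^T in column-major order yields
// (inmatrix1 * inmatrix2)^T, whose column-major layout is exactly the
// row-major layout of inmatrix1 * inmatrix2.)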
// this is essentially a full-blown dgemm wrapper, only missing the alpha
// parameter now
void ColMajorMatrixMultiplyStridedAddassign(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer col2_ct, __CLPK_integer stride2, __CLPK_integer common_ct, __CLPK_integer stride3, double beta, double* outmatrix);
HEADER_INLINE void ColMajorMatrixMultiplyStrided(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer col2_ct, __CLPK_integer stride2, __CLPK_integer common_ct, __CLPK_integer stride3, double* outmatrix) {
ColMajorMatrixMultiplyStridedAddassign(inmatrix1, inmatrix2, row1_ct, stride1, col2_ct, stride2, common_ct, stride3, 0.0, outmatrix);
}
HEADER_INLINE void RowMajorMatrixMultiplyIncr(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer col2_ct, __CLPK_integer common_ct, double* outmatrix) {
return ColMajorMatrixMultiplyStridedAddassign(inmatrix2, inmatrix1, col2_ct, col2_ct, row1_ct, common_ct, common_ct, col2_ct, 1.0, outmatrix);
}
HEADER_INLINE void RowMajorMatrixMultiplyStrided(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer col2_ct, __CLPK_integer stride2, __CLPK_integer common_ct, __CLPK_integer stride3, double* outmatrix) {
// stride1 should be close to common_ct
// stride2 should be close to col2_ct
// output matrix uses stride3, which should be close to col2_ct
return ColMajorMatrixMultiplyStridedAddassign(inmatrix2, inmatrix1, col2_ct, stride2, row1_ct, stride1, common_ct, stride3, 0.0, outmatrix);
}
HEADER_INLINE void RowMajorMatrixMultiplyStridedIncr(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer col2_ct, __CLPK_integer stride2, __CLPK_integer common_ct, __CLPK_integer stride3, double* outmatrix) {
return ColMajorMatrixMultiplyStridedAddassign(inmatrix2, inmatrix1, col2_ct, stride2, row1_ct, stride1, common_ct, stride3, 1.0, outmatrix);
}
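// Usage sketch (illustrative; xx, yy, and out are hypothetical row-major
// buffers): to compute out = xx * yy, where xx is r x k with row stride
// xx_stride and yy is k x c with row stride yy_stride, writing out with row
// stride out_stride:
//   RowMajorMatrixMultiplyStrided(xx, yy, r, xx_stride, c, yy_stride, k,
//                                 out_stride, out);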
// out^T := V^T * M
// out := M^T * V
void ColMajorVectorMatrixMultiplyStrided(const double* in_dvec1, const double* inmatrix2, __CLPK_integer common_ct, __CLPK_integer stride2, __CLPK_integer col2_ct, double* out_dvec);
// out := M * V
void ColMajorMatrixVectorMultiplyStrided(const double* inmatrix1, const double* in_dvec2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer common_ct, double* out_dvec);
void ColMajorFmatrixMultiplyStrided(const float* inmatrix1, const float* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer col2_ct, __CLPK_integer stride2, __CLPK_integer common_ct, __CLPK_integer stride3, float* outmatrix);
HEADER_INLINE void RowMajorFmatrixMultiply(const float* inmatrix1, const float* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer col2_ct, __CLPK_integer common_ct, float* outmatrix) {
ColMajorFmatrixMultiplyStrided(inmatrix2, inmatrix1, col2_ct, col2_ct, row1_ct, common_ct, common_ct, col2_ct, outmatrix);
}
// out := M * V
void ColMajorFmatrixVectorMultiplyStrided(const float* inmatrix1, const float* in_fvec2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer common_ct, float* out_fvec);
// out^T := V^T * M
// out := M^T * V
void ColMajorFvectorMatrixMultiplyStrided(const float* in_fvec1, const float* inmatrix2, __CLPK_integer common_ct, __CLPK_integer stride2, __CLPK_integer col2_ct, float* out_fvec);
void MatrixTransposeCopy(const double* old_matrix, uint32_t old_maj, uint32_t new_maj, double* new_matrix_iter);
void FmatrixTransposeCopy(const float* old_matrix, uint32_t old_maj, uint32_t new_maj, uint32_t new_maj_max, float* new_matrix_iter);
// A(A^T), where A is row-major; result is dim x dim
// ONLY UPDATES LOWER TRIANGLE OF result[].
void MultiplySelfTranspose(const double* input_matrix, uint32_t dim, uint32_t col_ct, double* result);
void MultiplySelfTransposeStrided(const double* input_matrix, uint32_t dim, uint32_t col_ct, uint32_t stride, double* result);
void MultiplySelfTransposeStridedF(const float* input_matrix, uint32_t dim, uint32_t col_ct, uint32_t stride, float* result);
// (A^T)A
void TransposeMultiplySelfIncr(double* input_part, uint32_t dim, uint32_t partial_row_ct, double* result);
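// Usage sketch (illustrative; aa is a hypothetical row-major dim x col_ct
// matrix): when the full Gram matrix A * A^T is needed rather than just its
// lower triangle, mirror the result afterward:
//   MultiplySelfTranspose(aa, dim, col_ct, result);
//   ReflectMatrix(dim, result);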
#ifndef NOLAPACK
BoolErr GetSvdRectLwork(uint32_t major_ct, uint32_t minor_ct, __CLPK_integer* lwork_ptr);
// currently a wrapper for dgesvd_().
IntErr SvdRect(uint32_t major_ct, uint32_t minor_ct, __CLPK_integer lwork, double* matrix, double* ss, double* vv, unsigned char* svd_rect_wkspace);
HEADER_INLINE IntErr SvdRectFused(uint32_t major_ct, uint32_t minor_ct, __CLPK_integer lwork, double* matrix, double* ss, unsigned char* svd_rect_wkspace) {
double* work = R_CAST(double*, svd_rect_wkspace);
double* vv_buf = &(work[lwork]);
return SvdRect(major_ct, minor_ct, lwork, matrix, ss, vv_buf, svd_rect_wkspace);
}
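// Usage sketch (illustrative; exact workspace layout beyond the initial
// lwork-double work array follows the plink2_matrix.cc implementation):
//   __CLPK_integer lwork;
//   if (GetSvdRectLwork(major_ct, minor_ct, &lwork)) {
//     // workspace-size query failed
//   }
//   // ...allocate svd_rect_wkspace (holding at least lwork doubles), then:
//   if (SvdRect(major_ct, minor_ct, lwork, matrix, ss, vv, svd_rect_wkspace)) {
//     // dgesvd_() reported failure (e.g. it did not converge)
//   }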
BoolErr GetExtractEigvecsLworks(uint32_t dim, uint32_t pc_ct, __CLPK_integer* lwork_ptr, __CLPK_integer* liwork_ptr, uintptr_t* wkspace_byte_ct_ptr);
// currently a wrapper for dsyevr_(). Matrix is expected to be
// lower-triangular from C's perspective.
// reverse_eigvecs is eigenvector-major, but the vectors are in order of
// *increasing* eigenvalue.
BoolErr ExtractEigvecs(uint32_t dim, uint32_t pc_ct, __CLPK_integer lwork, __CLPK_integer liwork, double* matrix, double* eigvals, double* reverse_eigvecs, unsigned char* extract_eigvecs_wkspace);
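// Usage sketch (illustrative):
//   __CLPK_integer lwork, liwork;
//   uintptr_t wkspace_byte_ct;
//   if (GetExtractEigvecsLworks(dim, pc_ct, &lwork, &liwork,
//                               &wkspace_byte_ct)) {
//     // workspace-size query failed
//   }
//   // ...allocate wkspace_byte_ct bytes for extract_eigvecs_wkspace, then:
//   if (ExtractEigvecs(dim, pc_ct, lwork, liwork, matrix, eigvals,
//                      reverse_eigvecs, extract_eigvecs_wkspace)) {
//     // dsyevr_() reported failure
//   }
//   // Since the eigenvectors are stored in increasing-eigenvalue order,
//   // &reverse_eigvecs[(pc_ct - 1 - k) * dim] is the eigenvector for the
//   // (k+1)th-largest retained eigenvalue.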
#endif
// Computes inverse of
// [ A b^T ]
// [ b c ]
// given precomputed A^{-1} (must be fully filled out, not just lower left).
// See e.g.
// https://en.wikipedia.org/wiki/Invertible_matrix#Blockwise_inversion .
// Only fills lower left of outmatrix.
// insert_idx specifies the zero-based row/column number of b/c in outmatrix.
BoolErr InvertRank1Symm(const double* a_inv, const double* bb, __CLPK_integer orig_dim, uint32_t insert_idx, double cc, double* __restrict outmatrix, double* __restrict ainv_b_buf);
// For when you only need the diagonal from InvertRank1Symm(); insert_idx is
// assumed to be orig_dim.
BoolErr InvertRank1SymmDiag(const double* a_inv, const double* bb, __CLPK_integer orig_dim, double cc, double* __restrict outdiag, double* __restrict ainv_b_buf);
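// Math sketch: with the scalar Schur complement s = c - b A^{-1} b^T and
// v = A^{-1} b^T (presumably what ainv_b_buf holds), blockwise inversion
// gives
//   [ A  b^T ]^{-1}   [ A^{-1} + v v^T / s    -v / s ]
//   [ b  c   ]      = [ -v^T / s               1 / s ]
// (using the symmetry of A, so that b A^{-1} = v^T).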
// [ A B^T ]
// [ B D ]
// B is row-major.
BoolErr InvertRank2Symm(const double* a_inv, const double* bb, __CLPK_integer orig_dim, __CLPK_integer b_stride, uint32_t insert_idx, double d11, double d12, double d22, double* __restrict outmatrix, double* __restrict b_ainv_buf, double* __restrict s_b_ainv_buf);
BoolErr InvertRank2SymmDiag(const double* a_inv, const double* bb, __CLPK_integer orig_dim, double d11, double d12, double d22, double* __restrict outdiag, double* __restrict b_ainv_buf, double* __restrict s_b_ainv_buf);
#ifdef NOLAPACK
BoolErr LinearRegressionInvMain(const double* xt_y_phenomaj, uint32_t predictor_ct, uint32_t pheno_ct, double* xtx_inv, double* fitted_coefs_phenomaj, MatrixInvertBuf1* mi_buf, double* dbl_2d_buf);
#else
BoolErr LinearRegressionInvMain(const double* xt_y_phenomaj, uint32_t predictor_ct, __CLPK_integer pheno_ct, double* xtx_inv, double* fitted_coefs_phenomaj);
#endif
// This now assumes xtx_inv contains predictors_pmaj * (predictors_pmaj^T) on
// input.
HEADER_INLINE BoolErr LinearRegressionInv(const double* pheno_d_pmaj, const double* predictors_pmaj, uint32_t predictor_ct, uint32_t sample_ct, uint32_t pheno_ct, double* xtx_inv, double* fitted_coefs_phenomaj, double* xt_y_phenomaj, __maybe_unused MatrixInvertBuf1* mi_buf, __maybe_unused double* dbl_2d_buf) {
// MultiplySelfTranspose(predictors_pmaj, predictor_ct, sample_ct,
// xtx_inv);
// categorical optimization possible here
for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
RowMajorMatrixMultiply(predictors_pmaj, &(pheno_d_pmaj[pheno_idx * sample_ct]), predictor_ct, 1, sample_ct, &(xt_y_phenomaj[pheno_idx * predictor_ct]));
}
#ifdef NOLAPACK
return LinearRegressionInvMain(xt_y_phenomaj, predictor_ct, pheno_ct, xtx_inv, fitted_coefs_phenomaj, mi_buf, dbl_2d_buf);
#else
return LinearRegressionInvMain(xt_y_phenomaj, predictor_ct, pheno_ct, xtx_inv, fitted_coefs_phenomaj);
#endif
}
HEADER_INLINE BoolErr LinearRegressionDVec(const double* pheno_d, const double* predictors_pmaj, uint32_t predictor_ct, uint32_t sample_ct, double* fitted_coefs, double* xtx_inv_buf, double* xt_y_buf, __maybe_unused MatrixInvertBuf1* mi_buf, __maybe_unused double* dbl_2d_buf) {
// categorical optimization possible here
const uint32_t sample_ctav = RoundUpPow2(sample_ct, kDoublePerDVec);
ColMajorVectorMatrixMultiplyStrided(pheno_d, predictors_pmaj, sample_ct, sample_ctav, predictor_ct, xt_y_buf);
MultiplySelfTransposeStrided(predictors_pmaj, predictor_ct, sample_ct, sample_ctav, xtx_inv_buf);
#ifdef NOLAPACK
return LinearRegressionInvMain(xt_y_buf, predictor_ct, 1, xtx_inv_buf, fitted_coefs, mi_buf, dbl_2d_buf);
#else
return LinearRegressionInvMain(xt_y_buf, predictor_ct, 1, xtx_inv_buf, fitted_coefs);
#endif
}
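// Usage sketch (illustrative; buffer names are hypothetical, predictors_pmaj
// is predictor-major with each predictor's values laid out with row stride
// RoundUpPow2(sample_ct, kDoublePerDVec), and buffer sizes follow
// plink2_matrix.cc):
//   if (LinearRegressionDVec(pheno_d, predictors_pmaj, predictor_ct,
//                            sample_ct, fitted_coefs, xtx_inv_buf, xt_y_buf,
//                            mi_buf, dbl_2d_buf)) {
//     // X^T X was singular; no unique least-squares solution
//   }
//   // fitted_coefs[] now holds beta = (X^T X)^{-1} X^T y, the ordinary
//   // least-squares fit of pheno_d on the predictors.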
// just for debugging
HEADER_INLINE void PrintVector(const double* vec, uintptr_t ct) {
printf("%g", vec[0]);
for (uintptr_t ulii = 1; ulii != ct; ++ulii) {
printf(" %g", vec[ulii]);
}
printf("\n");
}
HEADER_INLINE void PrintFvector(const float* vec, uintptr_t ct) {
printf("%g", S_CAST(double, vec[0]));
for (uintptr_t ulii = 1; ulii != ct; ++ulii) {
printf(" %g", S_CAST(double, vec[ulii]));
}
printf("\n");
}
HEADER_INLINE void PrintSymmMatrix(const double* matrix, uint32_t dim) {
for (uint32_t uii = 0; uii != dim; ++uii) {
printf("%g", matrix[uii * dim]);
for (uint32_t ujj = 1; ujj <= uii; ++ujj) {
printf(" %g", matrix[uii * dim + ujj]);
}
printf("\n");
}
}
HEADER_INLINE void PrintSymmFmatrix(const float* matrix, uint32_t dim) {
for (uint32_t uii = 0; uii != dim; ++uii) {
printf("%g", S_CAST(double, matrix[uii * dim]));
for (uint32_t ujj = 1; ujj <= uii; ++ujj) {
printf(" %g", S_CAST(double, matrix[uii * dim + ujj]));
}
printf("\n");
}
}
HEADER_INLINE void PrintMatrix(const double* matrix, uintptr_t row_ct, uintptr_t col_ct) {
for (uintptr_t ulii = 0; ulii != row_ct; ++ulii) {
printf("%g", matrix[ulii * col_ct]);
for (uintptr_t uljj = 1; uljj != col_ct; ++uljj) {
printf(" %g", matrix[ulii * col_ct + uljj]);
}
printf("\n");
}
}
HEADER_INLINE void PrintStridedMatrix(const double* matrix, uintptr_t row_ct, uintptr_t col_ct, uintptr_t stride) {
for (uintptr_t ulii = 0; ulii != row_ct; ++ulii) {
printf("%g", matrix[ulii * stride]);
for (uintptr_t uljj = 1; uljj != col_ct; ++uljj) {
printf(" %g", matrix[ulii * stride + uljj]);
}
printf("\n");
}
}
HEADER_INLINE void PrintFmatrix(const float* matrix, uintptr_t row_ct, uintptr_t col_ct, uintptr_t stride) {
for (uintptr_t ulii = 0; ulii != row_ct; ++ulii) {
printf("%g", S_CAST(double, matrix[ulii * stride]));
for (uintptr_t uljj = 1; uljj != col_ct; ++uljj) {
printf(" %g", S_CAST(double, matrix[ulii * stride + uljj]));
}
printf("\n");
}
}
#ifdef __cplusplus
} // namespace plink2
#endif
#endif // __PLINK2_MATRIX_H__