File: dgeqp3.go

package info (click to toggle)
golang-gonum-v1-gonum 0.15.1-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 18,792 kB
sloc: asm: 6,252; fortran: 5,271; sh: 377; ruby: 211; makefile: 98
file content (195 lines) | stat: -rw-r--r-- 5,052 bytes
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package gonum

import (
	"gonum.org/v1/gonum/blas"
	"gonum.org/v1/gonum/blas/blas64"
)

// Dgeqp3 computes a QR factorization with column pivoting of the m×n matrix A:
//
//	A*P = Q*R
//
// where P is a permutation matrix, Q is an orthogonal matrix and R is a
// min(m,n)×n upper trapezoidal matrix.
//
// On return, the upper triangle of A contains the matrix R. The elements below
// the diagonal together with tau represent the matrix Q as a product of
// elementary reflectors
//
//	Q = H_0 * H_1 * ... * H_{k-1}, where k = min(m,n).
//
// Each H_i has the form
//
//	H_i = I - tau * v * vᵀ
//
// where tau is a scalar and v is a vector with v[0:i] = 0 and v[i] = 1;
// v[i+1:m] is stored on exit in A[i+1:m,i], and tau in tau[i].
//
// jpvt specifies a column pivot to be applied to A. On entry, if jpvt[j] is at
// least zero, the jth column of A is permuted to the front of A*P (a leading
// column), if jpvt[j] is -1 the jth column of A is a free column. If jpvt[j] <
// -1, Dgeqp3 will panic. On return, jpvt holds the permutation that was
// applied; the jth column of A*P was the jpvt[j] column of A. jpvt must have
// length n or Dgeqp3 will panic.
//
// tau holds the scalar factors of the elementary reflectors. It must have
// length min(m,n), otherwise Dgeqp3 will panic.
//
// work must have length at least max(1,lwork), and lwork must be at least
// 3*n+1, otherwise Dgeqp3 will panic. For optimal performance lwork must be at
// least 2*n+(n+1)*nb, where nb is the optimal blocksize. On return, work[0]
// will contain the optimal value of lwork.
//
// If lwork == -1, instead of performing Dgeqp3, only the optimal value of lwork
// will be stored in work[0].
//
// Dgeqp3 is an internal routine. It is exported for testing purposes.
func (impl Implementation) Dgeqp3(m, n int, a []float64, lda int, jpvt []int, tau, work []float64, lwork int) {
	const (
		inb    = 1
		inbmin = 2
		ixover = 3
	)

	minmn := min(m, n)
	iws := 3*n + 1
	if minmn == 0 {
		iws = 1
	}
	switch {
	case m < 0:
		panic(mLT0)
	case n < 0:
		panic(nLT0)
	case lda < max(1, n):
		panic(badLdA)
	case lwork < iws && lwork != -1:
		panic(badLWork)
	case len(work) < max(1, lwork):
		panic(shortWork)
	}

	// Quick return if possible.
	if minmn == 0 {
		work[0] = 1
		return
	}

	nb := impl.Ilaenv(inb, "DGEQRF", " ", m, n, -1, -1)
	if lwork == -1 {
		work[0] = float64(2*n + (n+1)*nb)
		return
	}

	switch {
	case len(a) < (m-1)*lda+n:
		panic(shortA)
	case len(jpvt) != n:
		panic(badLenJpvt)
	case len(tau) < minmn:
		panic(shortTau)
	}

	for _, v := range jpvt {
		if v < -1 || n <= v {
			panic(badJpvt)
		}
	}

	bi := blas64.Implementation()

	// Move initial columns up front.
	var nfxd int
	for j := 0; j < n; j++ {
		if jpvt[j] == -1 {
			jpvt[j] = j
			continue
		}
		if j != nfxd {
			bi.Dswap(m, a[j:], lda, a[nfxd:], lda)
			jpvt[j], jpvt[nfxd] = jpvt[nfxd], j
		} else {
			jpvt[j] = j
		}
		nfxd++
	}

	// Factorize nfxd columns.
	//
	// Compute the QR factorization of nfxd columns and update remaining columns.
	if nfxd > 0 {
		na := min(m, nfxd)
		impl.Dgeqrf(m, na, a, lda, tau[:na], work, lwork)
		iws = max(iws, int(work[0]))
		if na < n {
			impl.Dormqr(blas.Left, blas.Trans, m, n-na, na, a, lda, tau[:na], a[na:], lda,
				work, lwork)
			iws = max(iws, int(work[0]))
		}
	}

	if nfxd >= minmn {
		work[0] = float64(iws)
		return
	}

	// Factorize free columns.
	sm := m - nfxd
	sn := n - nfxd
	sminmn := minmn - nfxd

	// Determine the block size.
	nb = impl.Ilaenv(inb, "DGEQRF", " ", sm, sn, -1, -1)
	nbmin := 2
	nx := 0

	if 1 < nb && nb < sminmn {
		// Determine when to cross over from blocked to unblocked code.
		nx = max(0, impl.Ilaenv(ixover, "DGEQRF", " ", sm, sn, -1, -1))

		if nx < sminmn {
			// Determine if workspace is large enough for blocked code.
			minws := 2*sn + (sn+1)*nb
			iws = max(iws, minws)
			if lwork < minws {
				// Not enough workspace to use optimal nb. Reduce
				// nb and determine the minimum value of nb.
				nb = (lwork - 2*sn) / (sn + 1)
				nbmin = max(2, impl.Ilaenv(inbmin, "DGEQRF", " ", sm, sn, -1, -1))
			}
		}
	}

	// Initialize partial column norms.
	// The first n elements of work store the exact column norms.
	for j := nfxd; j < n; j++ {
		work[j] = bi.Dnrm2(sm, a[nfxd*lda+j:], lda)
		work[n+j] = work[j]
	}
	j := nfxd
	if nbmin <= nb && nb < sminmn && nx < sminmn {
		// Use blocked code initially.

		// Compute factorization.
		var fjb int
		for topbmn := minmn - nx; j < topbmn; j += fjb {
			jb := min(nb, topbmn-j)

			// Factorize jb columns among columns j:n.
			fjb = impl.Dlaqps(m, n-j, j, jb, a[j:], lda, jpvt[j:], tau[j:],
				work[j:n], work[j+n:2*n], work[2*n:2*n+jb], work[2*n+jb:], jb)
		}
	}

	// Use unblocked code to factor the last or only block.
	if j < minmn {
		impl.Dlaqp2(m, n-j, j, a[j:], lda, jpvt[j:], tau[j:],
			work[j:n], work[j+n:2*n], work[2*n:])
	}

	work[0] = float64(iws)
}