// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
"math"
"gonum.org/v1/gonum/floats"
)

const (
iterationRestartFactor = 6
angleRestartThreshold = -0.9
)

var (
_ Method = (*CG)(nil)
_ localMethod = (*CG)(nil)
_ NextDirectioner = (*CG)(nil)
)

// CGVariant calculates the scaling parameter, β, used for updating the
// conjugate direction in the nonlinear conjugate gradient (CG) method.
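//
// A variant is chosen by setting the Variant field of CG; for example, a
// minimal sketch using types from this package:
//
//	cg := &CG{Variant: &PolakRibierePolyak{}}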
type CGVariant interface {
// Init is called at the first iteration and provides a way to initialize
// any internal state.
Init(loc *Location)
// Beta returns the value of the scaling parameter that is computed
// according to the particular variant of the CG method.
Beta(grad, gradPrev, dirPrev []float64) float64
}

var (
_ CGVariant = (*FletcherReeves)(nil)
_ CGVariant = (*PolakRibierePolyak)(nil)
_ CGVariant = (*HestenesStiefel)(nil)
_ CGVariant = (*DaiYuan)(nil)
_ CGVariant = (*HagerZhang)(nil)
)

// CG implements the nonlinear conjugate gradient method for solving nonlinear
// unconstrained optimization problems. It is a line search method that
// generates the search directions d_k according to the formula
//
// d_{k+1} = -∇f_{k+1} + β_k*d_k, d_0 = -∇f_0.
//
// Variants of the conjugate gradient method differ in the choice of the
// parameter β_k. The conjugate gradient method usually requires fewer function
// evaluations than the gradient descent method and no matrix storage, but
// L-BFGS is usually more efficient.
//
// CG implements a restart strategy that takes the steepest descent direction
// (i.e., d_{k+1} = -∇f_{k+1}) whenever any of the following conditions holds:
//
// - A certain number of iterations has elapsed without a restart. This number
// is controllable via IterationRestartFactor and if equal to 0, it is set to
// a reasonable default based on the problem dimension.
// - The angle between the gradients at two consecutive iterations ∇f_k and
// ∇f_{k+1} is too large.
// - The direction d_{k+1} is not a descent direction.
// - β_k returned from CGVariant.Beta is equal to zero.
//
// The line search for CG must yield step sizes that satisfy the strong Wolfe
// conditions at every iteration; otherwise the generated search direction
// might fail to be a descent direction. The line search should be more
// stringent than for Newton-like methods, which can be achieved by setting
// the gradient constant in the strong Wolfe conditions to a small value.
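//
// For example, a more stringent line search could be requested with a sketch
// like the following (MoreThuente is this package's More-Thuente line
// searcher, and CurvatureFactor is its gradient constant in the strong Wolfe
// conditions):
//
//	cg := &CG{Linesearcher: &MoreThuente{CurvatureFactor: 0.1}}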
//
// See also William Hager, Hongchao Zhang, A survey of nonlinear conjugate
// gradient methods. Pacific Journal of Optimization, 2 (2006), pp. 35-58, and
// references therein.
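//
// A minimal usage sketch from outside this package, assuming a Problem with
// Func and Grad set as Minimize requires:
//
//	p := optimize.Problem{
//		Func: func(x []float64) float64 { return x[0]*x[0] + x[1]*x[1] },
//		Grad: func(grad, x []float64) {
//			grad[0] = 2 * x[0]
//			grad[1] = 2 * x[1]
//		},
//	}
//	result, err := optimize.Minimize(p, []float64{1, 1}, nil, &optimize.CG{})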
type CG struct {
// Linesearcher must satisfy the strong Wolfe conditions at every iteration.
// If Linesearcher == nil, an appropriate default is chosen.
Linesearcher Linesearcher

// Variant implements the particular CG formula for computing β_k.
// If Variant is nil, an appropriate default is chosen.
Variant CGVariant

// InitialStep estimates the initial line search step size, because the CG
// method does not generate well-scaled search directions.
// If InitialStep is nil, an appropriate default is chosen.
InitialStep StepSizer

// IterationRestartFactor determines the frequency of restarts based on the
// problem dimension. The negative gradient direction is taken whenever
// ceil(IterationRestartFactor*(problem dimension)) iterations have elapsed
// without a restart. For medium and large-scale problems
// IterationRestartFactor should be set to 1; for low-dimensional problems a
// larger value should be chosen. Note that if the ceil function returns 1,
// CG will be identical to gradient descent.
// If IterationRestartFactor is 0, it will be set to 6.
// CG will panic if IterationRestartFactor is negative.
IterationRestartFactor float64

// AngleRestartThreshold sets the threshold angle for restart. The method
// is restarted if the cosine of the angle between two consecutive
// gradients is smaller than or equal to AngleRestartThreshold, that is, if
// ∇f_k·∇f_{k+1} / (|∇f_k| |∇f_{k+1}|) <= AngleRestartThreshold.
// A value of AngleRestartThreshold closer to -1 (successive gradients in
// exact opposite directions) will tend to reduce the number of restarts.
// If AngleRestartThreshold is 0, it will be set to -0.9.
// CG will panic if AngleRestartThreshold is not in the interval [-1, 0].
AngleRestartThreshold float64

// GradStopThreshold sets the threshold for stopping if the gradient norm
// gets too small. If GradStopThreshold is 0, it is defaulted to 1e-12; if
// it is NaN, the setting is not used.
GradStopThreshold float64

ls *LinesearchMethod
status Status
err error
restartAfter int
iterFromRestart int
dirPrev []float64
gradPrev []float64
gradPrevNorm float64
}

func (cg *CG) Status() (Status, error) {
return cg.status, cg.err
}

func (*CG) Uses(has Available) (uses Available, err error) {
return has.gradient()
}

func (cg *CG) Init(dim, tasks int) int {
cg.status = NotTerminated
cg.err = nil
return 1
}

func (cg *CG) Run(operation chan<- Task, result <-chan Task, tasks []Task) {
cg.status, cg.err = localOptimizer{}.run(cg, cg.GradStopThreshold, operation, result, tasks)
close(operation)
}

func (cg *CG) initLocal(loc *Location) (Operation, error) {
if cg.IterationRestartFactor < 0 {
panic("cg: IterationRestartFactor is negative")
}
if cg.AngleRestartThreshold < -1 || cg.AngleRestartThreshold > 0 {
panic("cg: AngleRestartThreshold not in [-1, 0]")
}
if cg.Linesearcher == nil {
cg.Linesearcher = &MoreThuente{CurvatureFactor: 0.1}
}
if cg.Variant == nil {
cg.Variant = &HestenesStiefel{}
}
if cg.InitialStep == nil {
cg.InitialStep = &FirstOrderStepSize{}
}
if cg.IterationRestartFactor == 0 {
cg.IterationRestartFactor = iterationRestartFactor
}
if cg.AngleRestartThreshold == 0 {
cg.AngleRestartThreshold = angleRestartThreshold
}
if cg.ls == nil {
cg.ls = &LinesearchMethod{}
}
cg.ls.Linesearcher = cg.Linesearcher
cg.ls.NextDirectioner = cg
return cg.ls.Init(loc)
}

func (cg *CG) iterateLocal(loc *Location) (Operation, error) {
return cg.ls.Iterate(loc)
}

func (cg *CG) InitDirection(loc *Location, dir []float64) (stepSize float64) {
dim := len(loc.X)
cg.restartAfter = int(math.Ceil(cg.IterationRestartFactor * float64(dim)))
cg.iterFromRestart = 0
// The initial direction is always the negative gradient.
copy(dir, loc.Gradient)
floats.Scale(-1, dir)
cg.dirPrev = resize(cg.dirPrev, dim)
copy(cg.dirPrev, dir)
cg.gradPrev = resize(cg.gradPrev, dim)
copy(cg.gradPrev, loc.Gradient)
cg.gradPrevNorm = floats.Norm(loc.Gradient, 2)
cg.Variant.Init(loc)
return cg.InitialStep.Init(loc, dir)
}

func (cg *CG) NextDirection(loc *Location, dir []float64) (stepSize float64) {
copy(dir, loc.Gradient)
floats.Scale(-1, dir)
cg.iterFromRestart++
var restart bool
if cg.iterFromRestart == cg.restartAfter {
// Restart because too many iterations have been taken without a restart.
restart = true
}
gDot := floats.Dot(loc.Gradient, cg.gradPrev)
gNorm := floats.Norm(loc.Gradient, 2)
if gDot <= cg.AngleRestartThreshold*gNorm*cg.gradPrevNorm {
// Restart because the angle between the last two gradients is too large.
restart = true
}
// Compute the scaling factor β_k even when restarting, because cg.Variant
// may be keeping internal state that needs to be updated at every iteration.
beta := cg.Variant.Beta(loc.Gradient, cg.gradPrev, cg.dirPrev)
if beta == 0 {
// β_k == 0 means that the steepest descent direction will be taken, so
// indicate that the method is in fact being restarted.
restart = true
}
if !restart {
// The method is not being restarted, so update the descent direction.
floats.AddScaled(dir, beta, cg.dirPrev)
if floats.Dot(loc.Gradient, dir) >= 0 {
// Restart because the new direction is not a descent direction.
restart = true
copy(dir, loc.Gradient)
floats.Scale(-1, dir)
}
}
// Get the initial line search step size from the StepSizer even if the
// method was restarted, because StepSizers need to see every iteration.
stepSize = cg.InitialStep.StepSize(loc, dir)
if restart {
// The method was restarted, and since the steepest descent direction is
// not related to the previous direction, discard the step size estimated
// by cg.InitialStep and use a step size of 1 instead.
stepSize = 1
// Reset the counter of iterations taken since the last restart.
cg.iterFromRestart = 0
}
copy(cg.gradPrev, loc.Gradient)
copy(cg.dirPrev, dir)
cg.gradPrevNorm = gNorm
return stepSize
}

func (*CG) needs() struct {
Gradient bool
Hessian bool
} {
return struct {
Gradient bool
Hessian bool
}{true, false}
}

// FletcherReeves implements the Fletcher-Reeves variant of the CG method that
// computes the scaling parameter β_k according to the formula
//
// β_k = |∇f_{k+1}|^2 / |∇f_k|^2.
type FletcherReeves struct {
prevNorm float64
}

func (fr *FletcherReeves) Init(loc *Location) {
fr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (fr *FletcherReeves) Beta(grad, _, _ []float64) (beta float64) {
norm := floats.Norm(grad, 2)
beta = (norm / fr.prevNorm) * (norm / fr.prevNorm)
fr.prevNorm = norm
return beta
}

// PolakRibierePolyak implements the Polak-Ribiere-Polyak variant of the CG
// method that computes the scaling parameter β_k according to the formula
//
// β_k = max(0, ∇f_{k+1}·y_k / |∇f_k|^2),
//
// where y_k = ∇f_{k+1} - ∇f_k.
type PolakRibierePolyak struct {
prevNorm float64
}

func (pr *PolakRibierePolyak) Init(loc *Location) {
pr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (pr *PolakRibierePolyak) Beta(grad, gradPrev, _ []float64) (beta float64) {
norm := floats.Norm(grad, 2)
dot := floats.Dot(grad, gradPrev)
beta = (norm*norm - dot) / (pr.prevNorm * pr.prevNorm)
pr.prevNorm = norm
return math.Max(0, beta)
}

// HestenesStiefel implements the Hestenes-Stiefel variant of the CG method
// that computes the scaling parameter β_k according to the formula
//
// β_k = max(0, ∇f_{k+1}·y_k / d_k·y_k),
//
// where y_k = ∇f_{k+1} - ∇f_k.
type HestenesStiefel struct {
y []float64
}

func (hs *HestenesStiefel) Init(loc *Location) {
hs.y = resize(hs.y, len(loc.Gradient))
}

func (hs *HestenesStiefel) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
floats.SubTo(hs.y, grad, gradPrev)
beta = floats.Dot(grad, hs.y) / floats.Dot(dirPrev, hs.y)
return math.Max(0, beta)
}

// DaiYuan implements the Dai-Yuan variant of the CG method that computes the
// scaling parameter β_k according to the formula
//
// β_k = |∇f_{k+1}|^2 / d_k·y_k,
//
// where y_k = ∇f_{k+1} - ∇f_k.
type DaiYuan struct {
y []float64
}

func (dy *DaiYuan) Init(loc *Location) {
dy.y = resize(dy.y, len(loc.Gradient))
}

func (dy *DaiYuan) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
floats.SubTo(dy.y, grad, gradPrev)
norm := floats.Norm(grad, 2)
return norm * norm / floats.Dot(dirPrev, dy.y)
}

// HagerZhang implements the Hager-Zhang variant of the CG method that computes the
// scaling parameter β_k according to the formula
//
// β_k = (y_k - 2 d_k |y_k|^2/(d_k·y_k))·∇f_{k+1} / (d_k·y_k),
//
// where y_k = ∇f_{k+1} - ∇f_k.
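//
// Expanding the product, β_k is evaluated as
//
//	β_k = (∇f_{k+1}·y_k - 2 (∇f_{k+1}·d_k) |y_k|^2/(d_k·y_k)) / (d_k·y_k),
//
// which is the form used in Beta.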
type HagerZhang struct {
y []float64
}

func (hz *HagerZhang) Init(loc *Location) {
hz.y = resize(hz.y, len(loc.Gradient))
}

func (hz *HagerZhang) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
floats.SubTo(hz.y, grad, gradPrev)
dirDotY := floats.Dot(dirPrev, hz.y)
gDotY := floats.Dot(grad, hz.y)
gDotDir := floats.Dot(grad, dirPrev)
yNorm := floats.Norm(hz.y, 2)
return (gDotY - 2*gDotDir*yNorm*yNorm/dirDotY) / dirDotY
}