File: LineSearchMoreThuente.h

Package: lbfgspp 0.4.0-1 (area: main; suites: forky, sid)
// Copyright (C) 2020-2025 Yixuan Qiu <yixuan.qiu@cos.name>
// Under MIT license

#ifndef LBFGSPP_LINE_SEARCH_MORE_THUENTE_H
#define LBFGSPP_LINE_SEARCH_MORE_THUENTE_H

#include <Eigen/Core>
#include <algorithm>  // std::min, std::max
#include <cmath>      // std::abs, std::sqrt, std::isfinite
#include <limits>     // std::numeric_limits
#include <stdexcept>  // std::invalid_argument, std::logic_error, std::runtime_error
#include "Param.h"

namespace LBFGSpp {

///
/// The line search algorithm by Moré and Thuente (1994), currently used for the L-BFGS-B algorithm.
///
/// The goal of this line search algorithm is to find a step size \f$\alpha\f$ that satisfies the strong Wolfe condition
/// \f$f(x+\alpha d) \le f(x) + \alpha\mu g(x)^T d\f$ and \f$|g(x+\alpha d)^T d| \le \eta|g(x)^T d|\f$.
/// Our implementation is a simplified version of the algorithm in [1]. We assume that \f$0<\mu<\eta<1\f$, while in [1]
/// they do not assume \f$\eta>\mu\f$. As a result, the algorithm in [1] has two stages, whereas our implementation
/// only needs the first stage to guarantee convergence.
///
/// Reference:
/// [1] Moré, J. J., & Thuente, D. J. (1994). Line search algorithms with guaranteed sufficient decrease.
///     ACM Transactions on Mathematical Software, 20(3), 286-307.
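///
/// A minimal usage sketch (hypothetical objective `fun` of dimension `n`, with
/// bound vectors `lb` and `ub`; `LBFGSBParam` and `LBFGSBSolver` are declared
/// elsewhere in this library):
/// \code
/// LBFGSpp::LBFGSBParam<double> param;
/// // LBFGSBSolver uses LineSearchMoreThuente by default
/// LBFGSpp::LBFGSBSolver<double> solver(param);
/// Eigen::VectorXd x = Eigen::VectorXd::Zero(n);
/// double fx;
/// int niter = solver.minimize(fun, x, fx, lb, ub);
/// \endcode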
///
template <typename Scalar>
class LineSearchMoreThuente
{
private:
    using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;

    // Minimizer of a quadratic function q(x) = c0 + c1 * x + c2 * x^2
    // that interpolates fa, ga, and fb, assuming the minimizer exists
    // For case I: fb >= fa and ga * (b - a) < 0
    static Scalar quadratic_minimizer(const Scalar& a, const Scalar& b, const Scalar& fa, const Scalar& ga, const Scalar& fb)
    {
        const Scalar ba = b - a;
        const Scalar w = Scalar(0.5) * ba * ga / (fa - fb + ba * ga);
        return a + w * ba;
    }
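
    // Derivation (sanity check): write q(x) = fa + ga * (x - a) + c2 * (x - a)^2,
    // where c2 = (fb - fa - ga * (b - a)) / (b - a)^2 follows from q(b) = fb.
    // Setting q'(x) = ga + 2 * c2 * (x - a) = 0 gives x = a - ga / (2 * c2),
    // which simplifies to a + w * (b - a) with w as computed above.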

    // Minimizer of a quadratic function q(x) = c0 + c1 * x + c2 * x^2
    // that interpolates fa, ga, and gb, assuming the minimizer exists
    // The result actually does not depend on fa
    // For case II:  ga * (b - a) < 0, ga * gb < 0
    // For case III: ga * (b - a) < 0, ga * gb >= 0, |gb| <= |ga|
    static Scalar quadratic_minimizer(const Scalar& a, const Scalar& b, const Scalar& ga, const Scalar& gb)
    {
        const Scalar w = ga / (ga - gb);
        return a + w * (b - a);
    }
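
    // Derivation (sanity check): q'(x) is linear with q'(a) = ga and q'(b) = gb,
    // so its unique zero is the secant point x = a + (b - a) * ga / (ga - gb),
    // which matches the expression above. For example, a = 0, b = 1, ga = -1,
    // gb = 1 gives the midpoint x = 0.5.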

    // Local minimizer of a cubic function q(x) = c0 + c1 * x + c2 * x^2 + c3 * x^3
    // that interpolates fa, ga, fb and gb, assuming a != b
    // Also sets a flag indicating whether the minimizer exists
    static Scalar cubic_minimizer(const Scalar& a, const Scalar& b, const Scalar& fa, const Scalar& fb,
                                  const Scalar& ga, const Scalar& gb, bool& exists)
    {
        using std::abs;
        using std::sqrt;

        const Scalar apb = a + b;
        const Scalar ba = b - a;
        const Scalar ba2 = ba * ba;
        const Scalar fba = fb - fa;
        const Scalar gba = gb - ga;
        // z3 = c3 * (b-a)^3, z2 = c2 * (b-a)^3, z1 = c1 * (b-a)^3
        const Scalar z3 = (ga + gb) * ba - Scalar(2) * fba;
        const Scalar z2 = Scalar(0.5) * (gba * ba2 - Scalar(3) * apb * z3);
        const Scalar z1 = fba * ba2 - apb * z2 - (a * apb + b * b) * z3;
        // std::cout << "z1 = " << z1 << ", z2 = " << z2 << ", z3 = " << z3 << std::endl;

        // If c3 = z3 / (b-a)^3 == 0, reduce to the quadratic problem
        const Scalar eps = std::numeric_limits<Scalar>::epsilon();
        if (abs(z3) < eps * abs(z2) || abs(z3) < eps * abs(z1))
        {
            // Minimizer exists if c2 > 0
            exists = (z2 * ba > Scalar(0));
            // Return the end point if the minimizer does not exist
            return exists ? (-Scalar(0.5) * z1 / z2) : b;
        }

        // Now we can assume z3 != 0
        // The minimizer is a solution to the equation c1 + 2*c2 * x + 3*c3 * x^2 = 0
        // roots = -(z2/z3) / 3 (+-) sqrt((z2/z3)^2 - 3 * (z1/z3)) / 3
        //
        // Let u = z2/(3z3) and v = z1/z2
        // The minimizer exists if v/u <= 1
        const Scalar u = z2 / (Scalar(3) * z3), v = z1 / z2;
        const Scalar vu = v / u;
        exists = (vu <= Scalar(1));
        if (!exists)
            return b;

        // We need to find a numerically stable way to compute the roots, as z3 may still be small
        //
        // If |u| >= |v|, let w = 1 + sqrt(1-v/u), and then
        // r1 = -u * w, r2 = -v / w, r1 does not need to be the smaller one
        //
        // If |u| < |v|, we must have uv <= 0, and then
        // r = -u (+-) sqrt(delta), where
        // sqrt(delta) = sqrt(|u|) * sqrt(|v|) * sqrt(1-u/v)
        Scalar r1 = Scalar(0), r2 = Scalar(0);
        if (abs(u) >= abs(v))
        {
            const Scalar w = Scalar(1) + sqrt(Scalar(1) - vu);
            r1 = -u * w;
            r2 = -v / w;
        }
        else
        {
            const Scalar sqrtd = sqrt(abs(u)) * sqrt(abs(v)) * sqrt(Scalar(1) - u / v);
            r1 = -u - sqrtd;
            r2 = -u + sqrtd;
        }
        return (z3 * ba > Scalar(0)) ? ((std::max)(r1, r2)) : ((std::min)(r1, r2));
    }
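
    // Worked example (sanity check): for q(x) = x^3 + x^2 - x on [a, b] = [0, 1],
    // we have fa = 0, fb = 1, ga = -1, gb = 4, giving z3 = 1, z2 = 1, z1 = -1,
    // u = 1/3, v = -1, and vu = -3 <= 1, so the minimizer exists; the function
    // returns the local minimizer 1/3, the root of q'(x) = 3x^2 + 2x - 1 at
    // which the curvature is positive.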

    // Select the next step size according to the current step sizes,
    // function values, and derivatives
    static Scalar step_selection(
        const Scalar& al, const Scalar& au, const Scalar& at,
        const Scalar& fl, const Scalar& fu, const Scalar& ft,
        const Scalar& gl, const Scalar& gu, const Scalar& gt)
    {
        using std::abs;

        if (al == au)
            return al;

        // If ft or gt is not finite (Inf or NaN), we return the midpoint of al and at
        if (!std::isfinite(ft) || !std::isfinite(gt))
            return (al + at) / Scalar(2);

        // ac: cubic interpolation of fl, ft, gl, gt
        // aq: quadratic interpolation of fl, gl, ft
        bool ac_exists;
        // std::cout << "al = " << al << ", at = " << at << ", fl = " << fl << ", ft = " << ft << ", gl = " << gl << ", gt = " << gt << std::endl;
        const Scalar ac = cubic_minimizer(al, at, fl, ft, gl, gt, ac_exists);
        const Scalar aq = quadratic_minimizer(al, at, fl, gl, ft);
        // std::cout << "ac = " << ac << ", aq = " << aq << std::endl;
        // Case 1: ft > fl
        if (ft > fl)
        {
            // The cubic minimizer should always exist when ft > fl, but fall back to aq just to be safe
            if (!ac_exists)
                return aq;
            // Then use the scheme described in the paper
            return (abs(ac - al) < abs(aq - al)) ? ac : ((aq + ac) / Scalar(2));
        }

        // as: quadratic interpolation of gl and gt
        const Scalar as = quadratic_minimizer(al, at, gl, gt);
        // Case 2: ft <= fl, gt * gl < 0
        if (gt * gl < Scalar(0))
            return (abs(ac - at) >= abs(as - at)) ? ac : as;

        // Case 3: ft <= fl, gt * gl >= 0, |gt| < |gl|
        const Scalar deltal = Scalar(1.1), deltau = Scalar(0.66);
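        // deltal scales extrapolation steps away from al, and deltau prevents the
        // new step from getting too close to au, by capping it at 66% of the
        // distance from at to au (a common safeguard for this algorithm)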
        if (abs(gt) < abs(gl))
        {
            // We choose either ac or as
            // The case for ac: 1. It exists, and
            //                  2. ac lies on the far side of at from al,
            //                     i.e., (ac - at) * (at - al) > 0, and
            //                  3. ac is closer to at than as is
            // The case for as: otherwise
            const Scalar res = (ac_exists &&
                                (ac - at) * (at - al) > Scalar(0) &&
                                abs(ac - at) < abs(as - at)) ?
                ac :
                as;
            // Postprocessing the chosen step
            return (at > al) ?
                std::min(at + deltau * (au - at), res) :
                std::max(at + deltau * (au - at), res);
        }

        // Simple extrapolation if au, fu, or gu is infinity
        if ((!std::isfinite(au)) || (!std::isfinite(fu)) || (!std::isfinite(gu)))
            return at + deltal * (at - al);

        // ae: cubic interpolation of ft, fu, gt, gu
        bool ae_exists;
        const Scalar ae = cubic_minimizer(at, au, ft, fu, gt, gu, ae_exists);
        // Case 4: ft <= fl, gt * gl >= 0, |gt| >= |gl|
        // The following is not used in the paper, but it seems to be a reasonable safeguard
        return (at > al) ?
            std::min(at + deltau * (au - at), ae) :
            std::max(at + deltau * (au - at), ae);
    }

public:
    ///
    /// Line search by Moré and Thuente (1994).
    ///
    /// \param f        A function object such that `f(x, grad)` returns the
    ///                 objective function value at `x`, and overwrites `grad` with
    ///                 the gradient.
    /// \param param    An `LBFGSParam` or `LBFGSBParam` object that stores the
    ///                 parameters of the solver.
    /// \param xp       The current point.
    /// \param drt      The current moving direction.
    /// \param step_max The upper bound for the step size that makes x feasible.
    /// \param step     In: The initial step length.
    ///                 Out: The calculated step length.
    /// \param fx       In: The objective function value at the current point.
    ///                 Out: The function value at the new point.
    /// \param grad     In: The current gradient vector.
    ///                 Out: The gradient at the new point.
    /// \param dg       In: The inner product between drt and grad.
    ///                 Out: The inner product between drt and the new gradient.
    /// \param x        Out: The new point moved to.
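    ///
    /// A minimal calling sketch (hypothetical setup: a simple quadratic objective
    /// and default `LBFGSParam` values are assumed):
    /// \code
    /// auto f = [](const Eigen::VectorXd& v, Eigen::VectorXd& g) {
    ///     g = v;  // gradient of 0.5 * ||v||^2
    ///     return 0.5 * v.squaredNorm();
    /// };
    /// LBFGSpp::LBFGSParam<double> param;
    /// Eigen::VectorXd xp = Eigen::VectorXd::Ones(2), grad = xp;
    /// Eigen::VectorXd drt = -grad, x(2);
    /// double fx = f(xp, grad), dg = grad.dot(drt);
    /// double step = 1.0, step_max = 10.0;
    /// LBFGSpp::LineSearchMoreThuente<double>::LineSearch(
    ///     f, param, xp, drt, step_max, step, fx, grad, dg, x);
    /// // On return, step holds the accepted step size, and x = xp + step * drt
    /// \endcode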
    ///
    template <typename Foo, typename SolverParam>
    static void LineSearch(Foo& f, const SolverParam& param,
                           const Vector& xp, const Vector& drt, const Scalar& step_max,
                           Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x)
    {
        using std::abs;
        // std::cout << "========================= Entering line search =========================\n\n";

        // Check the value of step
        if (step <= Scalar(0))
            throw std::invalid_argument("'step' must be positive");
        if (step > step_max)
            throw std::invalid_argument("'step' exceeds 'step_max'");

        // Save the function value at the current x
        const Scalar fx_init = fx;
        // Projection of gradient on the search direction
        const Scalar dg_init = dg;

        // std::cout << "fx_init = " << fx_init << ", dg_init = " << dg_init << std::endl << std::endl;

        // Make sure d points to a descent direction
        if (dg_init >= Scalar(0))
            throw std::logic_error("the moving direction does not decrease the objective function value");

        // Tolerance for convergence test
        // Sufficient decrease
        const Scalar test_decr = param.ftol * dg_init;
        // Curvature
        const Scalar test_curv = -param.wolfe * dg_init;

        // The bracketing interval
        Scalar I_lo = Scalar(0), I_hi = std::numeric_limits<Scalar>::infinity();
        Scalar fI_lo = Scalar(0), fI_hi = std::numeric_limits<Scalar>::infinity();
        Scalar gI_lo = (Scalar(1) - param.ftol) * dg_init, gI_hi = std::numeric_limits<Scalar>::infinity();
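        // In terms of the auxiliary function psi(a) = f(xp + a * drt) - f(xp) - a * ftol * dg_init
        // (see the comments inside the loop below), the initial values above are
        // psi(0) = 0 and psi'(0) = (1 - ftol) * dg_init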
        // We also need to save x and grad for step = I_lo, since we want to return the best
        // step size along the path when the strong Wolfe condition is not met
        Vector x_lo = xp, grad_lo = grad;
        Scalar fx_lo = fx_init, dg_lo = dg_init;

        // Function value and gradient at the current step size
        x.noalias() = xp + step * drt;
        fx = f(x, grad);
        dg = grad.dot(drt);

        // std::cout << "max_step = " << step_max << ", step = " << step << ", fx = " << fx << ", dg = " << dg << std::endl;

        // Convergence test
        if (fx <= fx_init + step * test_decr && abs(dg) <= test_curv)
        {
            // std::cout << "** Criteria met\n\n";
            // std::cout << "========================= Leaving line search =========================\n\n";
            return;
        }

        // Extrapolation factor
        const Scalar delta = Scalar(1.1);
        int iter;
        for (iter = 0; iter < param.max_linesearch; iter++)
        {
            // ft = psi(step) = f(xp + step * drt) - f(xp) - step * test_decr
            // gt = psi'(step) = dg - mu * dg_init
            // mu = param.ftol
            const Scalar ft = fx - fx_init - step * test_decr;
            const Scalar gt = dg - param.ftol * dg_init;

            // Update step size and bracketing interval
            Scalar new_step;
            if (ft > fI_lo)
            {
                // Case 1: ft > fl
                new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt);
                // Sanity check: if the computed new_step is too small, typically due to
                // an extremely large value of ft, switch to the midpoint of I_lo and step
                if (new_step <= param.min_step)
                    new_step = (I_lo + step) / Scalar(2);

                I_hi = step;
                fI_hi = ft;
                gI_hi = gt;

                // std::cout << "Case 1: new step = " << new_step << std::endl;
            }
            else if (gt * (I_lo - step) > Scalar(0))
            {
                // Case 2: ft <= fl, gt * (al - at) > 0
                //
                // Page 291 of Moré and Thuente (1994) suggests that
                // newat = min(at + delta * (at - al), amax), delta in [1.1, 4]
                new_step = std::min(step_max, step + delta * (step - I_lo));

                // We can also consider the following scheme:
                // First let step_selection() decide a value, and then project to the range above
                //
                // new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt);
                // const Scalar delta2 = Scalar(4);
                // const Scalar t1 = step + delta * (step - I_lo);
                // const Scalar t2 = step + delta2 * (step - I_lo);
                // const Scalar tl = std::min(t1, t2), tu = std::max(t1, t2);
                // new_step = std::min(tu, std::max(tl, new_step));
                // new_step = std::min(step_max, new_step);

                I_lo = step;
                fI_lo = ft;
                gI_lo = gt;
                // Move x and grad to x_lo and grad_lo, respectively
                x_lo.swap(x);
                grad_lo.swap(grad);
                fx_lo = fx;
                dg_lo = dg;

                // std::cout << "Case 2: new step = " << new_step << std::endl;
            }
            else
            {
                // Case 3: ft <= fl, gt * (al - at) <= 0
                new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt);

                I_hi = I_lo;
                fI_hi = fI_lo;
                gI_hi = gI_lo;

                I_lo = step;
                fI_lo = ft;
                gI_lo = gt;
                // Move x and grad to x_lo and grad_lo, respectively
                x_lo.swap(x);
                grad_lo.swap(grad);
                fx_lo = fx;
                dg_lo = dg;

                // std::cout << "Case 3: new step = " << new_step << std::endl;
            }

            // Cases 1 and 3 are interpolations, whereas Case 2 is extrapolation
            // This means that Case 2 may return new_step = step_max,
            // and we need to decide whether to accept this value
            // 1. If both step and new_step equal step_max, step will not
            //    change any further, so we accept it
            // 2. Otherwise, we need to test the function value and gradient
            //    at step_max, and decide later

            // If step, new_step, and step_max are all equal, directly return the computed x and fx
            if (step == step_max && new_step >= step_max)
            {
                // std::cout << "** Maximum step size reached\n\n";
                // std::cout << "========================= Leaving line search =========================\n\n";

                // Move {x, grad}_lo back before returning
                x.swap(x_lo);
                grad.swap(grad_lo);
                return;
            }
            // Otherwise, recompute x and fx based on new_step
            step = new_step;

            if (step < param.min_step)
                throw std::runtime_error("the line search step became smaller than the minimum value allowed");

            if (step > param.max_step)
                throw std::runtime_error("the line search step became larger than the maximum value allowed");

            // Update parameter, function value, and gradient
            x.noalias() = xp + step * drt;
            fx = f(x, grad);
            dg = grad.dot(drt);

            // std::cout << "step = " << step << ", fx = " << fx << ", dg = " << dg << std::endl;

            // Convergence test
            if (fx <= fx_init + step * test_decr && abs(dg) <= test_curv)
            {
                // std::cout << "** Criteria met\n\n";
                // std::cout << "========================= Leaving line search =========================\n\n";
                return;
            }

            // Now suppose step = step_max, and we need to decide whether to
            // exit the line search (see the comments above regarding step_max)
            // Reaching here means this step size has failed the convergence
            // test, so either the sufficient decrease condition or the curvature
            // condition is not yet met
            //
            // Typically the curvature condition is the harder one to meet, and it is
            // possible that no step size in [0, step_max] satisfies it
            //
            // But we need to make sure that the psi function value at this step is
            // smaller than the best one so far. If not, go to the next iteration
            // to find a better one
            if (step >= step_max)
            {
                const Scalar ft_bound = fx - fx_init - step * test_decr;
                if (ft_bound <= fI_lo)
                {
                    // std::cout << "** Maximum step size reached\n\n";
                    // std::cout << "========================= Leaving line search =========================\n\n";
                    return;
                }
            }
        }

        // If we have used up all line search iterations, the strong Wolfe condition
        // is still not met. We choose not to raise an exception (unless no step
        // satisfying sufficient decrease is found), but to return the best step size so far
        if (iter >= param.max_linesearch)
        {
            // throw std::runtime_error("the line search routine reached the maximum number of iterations");

            // First test whether the last step is better than I_lo
            // If yes, return the last step
            const Scalar ft = fx - fx_init - step * test_decr;
            if (ft <= fI_lo)
                return;

            // If not, then the best step size so far is I_lo, but it needs to be positive
            if (I_lo <= Scalar(0))
                throw std::runtime_error("the line search routine is unable to sufficiently decrease the function value");

            // Return everything with _lo
            step = I_lo;
            fx = fx_lo;
            dg = dg_lo;
            // Move {x, grad}_lo back
            x.swap(x_lo);
            grad.swap(grad_lo);
            return;
        }
    }
};

}  // namespace LBFGSpp

#endif  // LBFGSPP_LINE_SEARCH_MORE_THUENTE_H